true
library(tidyverse)
library(dplyr)
library(readr)
library(readxl)
library(vdemdata)
library(survey) 
library(countrycode)
library(rvest)
library(stringr)
library(mice)       
library(VIM)
library(visdat)    
library(naniar)     
library(caret) 
library(haven)
library(magrittr)
library(sjlabelled)
library(mice)
library(tidyr)
library(ggplot2)
library(scales)
library(viridis)
library(kableExtra)
library(corrplot)
library(cluster)
library(factoextra)
library(ggrepel)
library(randomForest)
library(DALEX) 
library(ranger) 
library(lme4)      
library(lmerTest)  
library(sjstats)   
library(sjPlot)    
library(modelsummary)
library(broom.mixed)  
library(stargazer)
library(car)
library(rtf)
library(MuMIn)

External data

Country mapping

# mapping -----------------------------------------------------------------
# we comment out the country codes as we likely don't need them
survey_country_mapping <- data.frame(
  country_name = c("France", "Belgium", "Netherlands", "Germany", "Italy", 
                   "Luxembourg", "Denmark", "Ireland", "United Kingdom", "Greece", 
                   "Spain", "Portugal", "Finland", "Sweden", 
                   "Austria", "Cyprus", "Czech Republic", "Estonia", "Hungary", 
                   "Latvia", "Lithuania", "Malta", "Poland", "Slovakia", 
                   "Slovenia", "Bulgaria", "Romania", "Croatia"),
  #survey_country_code = c(1, 2, 3, 4, 5, 
  #                        6, 7, 8, 9, 11, 
  #                        12, 13, 16, 17, 
  #                        18, 19, 20, 21, 22, 
  #                        23, 24, 25, 26, 27, 
  #                        28, 29, 30, 32),
  iso2 = c("FR", "BE", "NL", "DE", "IT", 
           "LU", "DK", "IE", "GB", "GR", 
           "ES", "PT", "FI", "SE", 
           "AT", "CY", "CZ", "EE", "HU", 
           "LV", "LT", "MT", "PL", "SK", 
           "SI", "BG", "RO", "HR"),
  stringsAsFactors = FALSE)

# create a mapping for country name standardization: the Czech Republic is a problem
country_name_mapping <- c("Czechia" = "Czech Republic")

V-Dem dataset

  • v2x_libdem: index of liberal democracy
  • v2x_polyarchy: index of electoral democracy
  • v2x_gender: index of women’s political empowerment
  • v2x_egaldem: index of egalitarian democracy
  • v2x_liberal: index of civil liberties
  • v2xcs_ccsi: index of civil society participation
  • v2x_freexp: index of freedom of expression
# 1. vdemdata ----------------------------------------------------------------
# install the package from GitHub first:
# devtools::install_github("vdeminstitute/vdemdata")
vdem_data <- vdem

# apply country name standardization before filtering
vdem_data_standardized <- vdem_data %>%
  mutate(
    # Standardize country names
    country_name = ifelse(country_name %in% names(country_name_mapping),
                          country_name_mapping[country_name],
                          country_name))

# filter for most recent data (2019 to match survey data)
vdem_2019 <- vdem_data_standardized %>%
  filter(country_name %in% survey_country_mapping$country_name, year == 2019) %>%
  select(country_name, v2x_libdem, v2x_polyarchy, v2x_gender, 
         v2x_egaldem, v2x_liberal, v2xcs_ccsi, v2x_freexp)  # select relevant variables

saveRDS(vdem_2019, file = "vdem_eu_2019.rds")
write.csv(vdem_2019, "vdem_eu_2019.csv")

Rainbow map scores

# 2. Rainbowmap --------------------------------------------------------------
# https://rainbowmap.ilga-europe.org/

# rainbow_data <- read_csv("data/raw/2024-rainbow-map-data.csv")

# but the problem: that's for 2024, not 2019 or before 2019
# hence, we would need to get the data for 2019 and years before:

# create data frames for each year
df_2019 <- data.frame(
  Country = c("Malta", "Belgium", "United Kingdom", "Norway", "France", "Finland",
              "Denmark", "Spain", "Portugal", "Sweden", "Netherlands", "Austria",
              "Germany", "Croatia", "Greece", "Ireland", "Hungary", "Luxembourg",
              "Iceland", "Slovenia", "Montenegro", "Estonia", "Switzerland", "Georgia",
              "Bosnia & Herzegovina", "Slovakia", "Albania", "Serbia", "Bulgaria", 
              "Czechia", "Kosovo", "Andorra", "Cyprus", "Romania", "Ukraine", 
              "Lithuania", "Italy", "Poland", "Latvia", "Belarus", "Moldova", 
              "Russia", "North Macedonia", "Liechtenstein", "San Marino", "Armenia",
              "Turkey", "Monaco", "Azerbaijan"),
  Value = c(74.72, 70.40, 67.62, 63.64, 62.73, 62.20, 60.31, 58.49, 57.50, 52.86,
            49.72, 48.54, 47.45, 45.83, 44.82, 44.72, 42.83, 41.34, 40.20, 39.82, 
            37.16, 34.38, 30.06, 29.49, 29.37, 29.04, 27.74, 26.43, 25.75, 25.50,
            25.34, 23.93, 22.03, 20.67, 19.97, 19.91, 19.43, 18.10, 16.83, 15.82,
            15.10, 11.75, 10.88, 10.08, 9.87, 8.50, 8.50, 7.31, 5.67),
  Year = 2019)

df_2018 <- data.frame(
  Country = c("Malta", "Belgium", "United Kingdom", "Finland", "France", "Norway",
              "Portugal", "Denmark", "Spain", "Sweden", "Netherlands", "Germany",
              "Austria", "Greece", "Ireland", "Croatia", "Slovenia", "Luxembourg",
              "Iceland", "Hungary", "Estonia", "Switzerland", "Montenegro", "Andorra",
              "Albania", "Kosovo", "Bosnia & Herzegovina", "Serbia", "Czechia", 
              "Cyprus", "Slovakia", "Italy", "Georgia", "Bulgaria", "Lithuania",
              "Romania", "Ukraine", "Poland", "Liechtenstein", "Latvia", 
              "North Macedonia", "Belarus", "Moldova", "San Marino", "Russia", 
              "Monaco", "Turkey", "Armenia", "Azerbaijan"),
  Value = c(91.94, 78.70, 73.48, 73.27, 72.81, 72.74, 69.16, 67.69, 67.03, 60.10,
            59.64, 59.00, 56.40, 52.32, 52.22, 50.58, 47.73, 47.48, 47.22, 47.16,
            39.34, 38.44, 37.74, 34.81, 33.24, 32.98, 31.38, 29.68, 29.20, 28.95,
            28.65, 28.82, 25.87, 24.15, 21.78, 21.12, 20.95, 18.23, 17.87, 16.07,
            14.03, 13.35, 13.08, 12.32, 10.80, 9.78, 8.60, 7.20, 4.70),
  Year = 2018)

df_2017 <- data.frame(
  Country = c("Malta", "Norway", "United Kingdom", "Belgium", "France", "Portugal",
              "Finland", "Denmark", "Spain", "Netherlands", "Croatia", "Sweden", 
              "Austria", "Germany", "Ireland", "Iceland", "Greece", "Luxembourg", 
              "Hungary", "Slovenia", "Montenegro", "Andorra", "Estonia", "Albania",
              "Bosnia & Herzegovina", "Switzerland", "Kosovo", "Serbia", "Czechia", 
              "Cyprus", "Slovakia", "Italy", "Georgia", "Bulgaria", "Romania", 
              "Ukraine", "Poland", "Liechtenstein", "Lithuania", "Latvia", 
              "North Macedonia", "Belarus", "Moldova", "San Marino", "Monaco", 
              "Turkey", "Armenia", "Azerbaijan"),
  Value = c(77.74, 75.73, 71.86, 70.82, 69.16, 68.27, 67.69, 67.03, 64.44, 62.36,
            60.10, 55.58, 54.41, 52.22, 47.22, 46.92, 46.48, 44.82, 44.28, 38.64, 
            34.81, 33.31, 33.24, 31.34, 30.94, 30.48, 29.68, 29.20, 28.95, 27.60, 
            26.67, 25.87, 23.15, 21.12, 19.00, 18.23, 17.87, 17.28, 17.12, 16.03, 
            13.35, 13.08, 12.32, 9.78, 8.60, 7.20, 6.40, 4.70),
  Year = 2017)

df_2016 <- data.frame(
  Country = c("Malta", "Belgium", "United Kingdom", "Spain", "Denmark", "Portugal",
              "Finland", "France", "Croatia", "Netherlands", "Norway", "Sweden", 
              "Austria", "Iceland", "Greece", "Germany", "Ireland", "Hungary", 
              "Luxembourg", "Montenegro", "Estonia", "Albania", "Switzerland", 
              "Andorra", "Serbia", "Cyprus", "Slovenia", "Czechia", "Kosovo", 
              "Georgia", "Bosnia & Herzegovina", "Slovakia", "Bulgaria", "Romania",
              "Italy", "Poland", "Liechtenstein", "Lithuania", "Latvia", 
              "North Macedonia", "San Marino", "Moldova", "Belarus", "Monaco", 
              "Turkey", "Armenia", "Azerbaijan"),
  Value = c(77.75, 81.85, 79.19, 70.95, 70.90, 69.55, 67.25, 66.60, 66.55, 66.10, 
            65.15, 64.85, 62.21, 59.00, 58.30, 55.14, 54.70, 51.40, 50.35, 45.20, 
            38.25, 34.40, 33.15, 32.10, 32.00, 31.95, 31.65, 31.60, 31.55, 30.35, 
            29.40, 29.20, 24.00, 23.45, 19.75, 18.30, 18.20, 18.10, 17.35, 15.55, 
            14.40, 14.15, 13.35, 10.80, 8.75, 7.20, 4.85),
  Year = 2016)

df_2015 <- data.frame(
  Country = c("United Kingdom", "Belgium", "Malta", "Sweden", "Croatia", "Netherlands",
              "Norway", "Spain", "Denmark", "Portugal", "France", "Iceland", "Finland", 
              "Germany", "Austria", "Hungary", "Montenegro", "Luxembourg", "Albania",
              "Ireland", "Greece", "Georgia", "Czechia", "Estonia", "Slovenia",
              "Andorra", "Bosnia & Herzegovina", "Serbia", "Slovakia", "Romania",
              "Switzerland", "Bulgaria", "Poland", "Italy", "Liechtenstein", 
              "Lithuania", "Cyprus", "Kosovo", "Latvia", "Moldova", "Belarus",
              "San Marino", "North Macedonia", "Turkey", "Monaco", "Armenia", 
              "Azerbaijan"),
  Value = c(88.00, 83.00, 77.00, 72.00, 71.00, 69.00, 69.00, 69.00, 68.00, 67.00,
            65.00, 63.00, 62.00, 56.00, 52.00, 50.00, 46.00, 43.00, 42.00, 40.00,
            39.00, 36.00, 35.00, 34.00, 32.00, 31.00, 29.00, 29.00, 29.00, 28.00, 
            28.00, 27.00, 26.00, 22.00, 19.00, 19.00, 18.00, 18.00, 18.00, 16.00,
            14.00, 14.00, 13.00, 12.00, 11.00, 9.00, 5.00),
  Year = 2015)

df_2014 <- data.frame(
  Country = c("United Kingdom", "Belgium", "Spain", "Netherlands", "Norway", 
              "Portugal", "Sweden", "France", "Iceland", "Denmark", "Malta", 
              "Croatia", "Germany", "Hungary", "Austria", "Montenegro", "Finland",
              "Albania", "Slovenia", "Czechia", "Estonia", "Ireland", "Greece", 
              "Slovakia", "Serbia", "Bulgaria", "Switzerland", "Luxembourg", 
              "Romania", "Poland", "Italy", "Georgia", "Lithuania", "Andorra",
              "Bosnia & Herzegovina", "Cyprus", "Latvia", "Liechtenstein", "Kosovo",
              "Moldova", "Turkey", "San Marino", "Belarus", "North Macedonia", 
              "Ukraine", "Monaco", "Armenia", "Azerbaijan"),
  Value = c(80.25, 78.10, 73.26, 69.90, 68.40, 66.60, 65.30, 64.10, 63.95, 59.90,
            56.80, 56.30, 55.68, 53.65, 52.10, 47.05, 45.30, 38.40, 35.00, 34.65, 
            34.65, 33.65, 31.15, 30.50, 30.30, 30.00, 28.85, 28.35, 27.95, 27.65, 
            27.40, 28.05, 21.70, 20.60, 20.10, 19.65, 19.65, 18.00, 17.10, 16.50,
            14.15, 13.70, 13.60, 13.30, 11.65, 10.10, 8.50, 6.60),
  Year = 2014)

df_2013 <- data.frame(
  Country = c("United Kingdom", "Belgium", "Norway", "Sweden", "Spain", "Portugal", 
              "France", "Netherlands", "Denmark", "Iceland", "Hungary", "Germany",
              "Croatia", "Finland", "Austria", "Albania", "Malta", "Slovenia", 
              "Czechia", "Ireland", "Romania", "Estonia", "Switzerland", "Luxembourg",
              "Greece", "Slovakia", "Montenegro", "Serbia", "Poland", "Georgia",
              "Lithuania", "Andorra", "Bosnia & Herzegovina", "Cyprus", "Latvia", 
              "Italy", "Bulgaria", "Liechtenstein", "Turkey", "San Marino", "Belarus",
              "Kosovo", "North Macedonia", "Ukraine", "Monaco", "Armenia", "Azerbaijan"),
  Value = c(78.50, 68.73, 65.65, 65.30, 65.04, 64.60, 64.10, 60.00, 59.80, 55.50,
            54.70, 54.29, 48.30, 47.25, 43.35, 38.40, 35.30, 35.00, 34.65, 33.65, 
            31.30, 28.90, 28.85, 28.35, 28.10, 28.90, 28.65, 25.05, 21.65, 21.05, 
            20.70, 20.60, 19.95, 19.65, 19.65, 19.40, 18.00, 15.50, 14.15, 13.70, 
            13.60, 13.50, 13.30, 11.65, 10.10, 7.50, 7.10),
  Year = 2013)

# combine all data frames into one
df_combined <- bind_rows(df_2019, df_2018, df_2017, df_2016, df_2015, df_2014, df_2013)

# create new, compressed df
# step 1: Filter data for 2019 and 2018
df_2019 <- df_combined %>% filter(Year == 2019) %>% rename(Value_2019 = Value)
df_2018 <- df_combined %>% filter(Year == 2018) %>% rename(Value_2018 = Value)

# step 2: Filter data for 2013 and 2014 and calculate the average
df_2013_2014 <- df_combined %>% filter(Year %in% c(2013, 2014)) %>%
  group_by(Country) %>%
  summarise(Avg_2013_2014 = mean(Value, na.rm = TRUE))

# step 3: Join the data frames for 2019 and 2018
df_compressed <- df_2019 %>%
  left_join(df_2018, by = "Country") %>%
  select(Country, Value_2019, Value_2018)

# step 4: Calculate the average for 2019 and 2018
df_compressed <- df_compressed %>%
  mutate(Avg_2019_2018 = (Value_2019 + Value_2018) / 2)

# step 5: Join the average for 2013 and 2014
df_compressed <- df_compressed %>%
  left_join(df_2013_2014, by = "Country")

# step 6: Calculate the difference between the averages
df_compressed <- df_compressed %>%
  mutate(Difference = Avg_2019_2018 - Avg_2013_2014)

# step 7: Select and reorder columns for the final compressed data frame
df_compressed <- df_compressed %>%
  select(Country, Value_2019, Value_2018, Avg_2019_2018, 
         Avg_2013_2014, Difference)

rainbow_df <- df_compressed

# save the data frame
saveRDS(rainbow_df, file = "rainbow_df.rds")
write.csv(rainbow_df, "rainbow_df.csv")

The economist democracy scores

# 3. The Economist: Democracy scores 2018 ---------------------------------
# https://enperspectiva.uy/wp-content/uploads/2019/01/Democracy_Index_2018.pdf
democracy_scores <- data.frame(
  Country = c("Belgium", "Denmark", "Greece", "Spain", "Finland", "France", "Ireland", "Italy", "Luxembourg", "Netherlands", "Austria", "Portugal", "Sweden", "Germany", "United Kingdom", "Bulgaria", "Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia", "Croatia"),
  ISO2 = c("BE", "DK", "GR", "ES", "FI", "FR", "IE", "IT", "LU", "NL", "AT", "PT", "SE", "DE", "GB", "BG", "CY", "CZ", "EE", "HU", "LV", "LT", "MT", "PL", "RO", "SK", "SI", "HR"),
  Overall_score = c(7.78, 9.22, 7.29, 8.08, 9.14, 7.80, 9.15, 7.71, 8.81, 8.89, 8.29, 7.84, 9.39, 8.68, 8.53, 7.03, 7.59, 7.69, 7.97, 6.63, 7.38, 7.50, 8.21, 6.67, 6.38, 7.10, 7.50, 6.57),
  #Global_rank = c(31, 5, 39, 19, 8, 29, "6=", 33, 12, 11, 16, 27, 3, 13, 14, 46, 35, 34, "23=", "57=", 38, "36=", 18, "54=", "66=", 44, "36=", 60),
  #Regional_rank = c(17, 4, 20, 14, 6, 16, 5, 18, 9, 8, 12, 15, 3, 10, 11, 7, 19, 2, 1, 9, 5, "3=", 13, 8, 12, 6, "3=", 10),
  Electoral_process_and_pluralism = c(9.58, 10.00, 9.58, 9.17, 10.00, 9.58, 9.58, 9.58, 10.00, 9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.17, 9.17, 9.58, 9.58, 8.75, 9.58, 9.58, 9.17, 9.17, 9.17, 9.58, 9.58, 9.17),
  Functioning_of_government = c(8.93, 9.29, 5.36, 7.14, 8.93, 7.50, 7.86, 6.07, 8.93, 9.29, 7.86, 7.50, 9.64, 8.57, 7.50, 6.43, 6.43, 6.79, 8.21, 6.07, 6.07, 6.43, 8.21, 6.07, 5.71, 6.79, 6.79, 6.07),
  Political_participation = c(5.00, 8.33, 6.11, 7.78, 8.33, 7.78, 8.33, 7.78, 6.67, 8.33, 8.33, 6.11, 8.33, 8.33, 8.33, 7.22, 6.67, 6.67, 6.67, 5.00, 5.56, 6.11, 6.11, 6.11, 5.00, 5.56, 6.67, 5.56),
  Political_culture = c(6.88, 9.38, 6.88, 7.50, 8.75, 5.63, 10.00, 6.88, 8.75, 8.13, 6.88, 6.88, 10.00, 7.50, 8.13, 4.38, 6.88, 6.88, 6.88, 6.25, 6.88, 6.25, 8.75, 4.38, 4.38, 5.63, 6.25, 5.00),
  Civil_liberties = c(8.53, 9.12, 8.53, 8.82, 9.71, 8.53, 10.00, 8.24, 9.71, 9.12, 8.82, 9.12, 9.41, 9.41, 9.12, 7.94, 8.82, 8.53, 8.53, 7.06, 8.82, 9.12, 8.82, 7.65, 7.65, 7.94, 8.24, 7.06),
  Regime_type = c("Flawed democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Flawed democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Full democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Full democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy"))

saveRDS(democracy_scores, file = "democracy_scores.rds")
write.csv(democracy_scores, "democracy_scores.csv")

World happiness report

# 4. Happiness data 2018 ---------------------------------------------------------------------
# https://s3.amazonaws.com/happiness-report/2018/WHR_web.pdf
happiness_scores <- data.frame(
  Country = c("Finland", "Denmark", "Greece", "Spain", "France", "Ireland", "Italy", "Luxembourg", "Netherlands", "Austria", "Portugal", "Sweden", "Germany", "United Kingdom", "Bulgaria", "Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia", "Croatia"),
  ISO2 = c("FI", "DK", "GR", "ES", "FR", "IE", "IT", "LU", "NL", "AT", "PT", "SE", "DE", "GB", "BG", "CY", "CZ", "EE", "HU", "LV", "LT", "MT", "PL", "RO", "SK", "SI", "HR"),
  Happiness_Score = c(7.632, 7.555, 5.358, 6.310, 6.489, 6.977, 6.000, 6.910, 7.441, 7.139, 5.410, 7.314, 6.965, 6.814, 4.933, 5.762, 6.711, 5.739, 5.620, 5.933, 5.952, 6.627, 6.123, 5.945, 6.173, 5.948, 5.321))

saveRDS(happiness_scores, file = "happiness_scores.rds")
write.csv(happiness_scores, "happiness_scores.csv")

GDP per capita

# 5. GDP per capita ---------------------------------------------------------------------
df_GDP <- read_csv("data/raw/data_20250228194704.csv")
## New names:
## Rows: 1064 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): IndicatorName, VariableName, MeasurementName, CountryCode, Alpha3Co... dbl
## (2): IndicatorCode, PeriodCode lgl (1): ...10
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...10`
# apply the mapping to the CountryName column
df_GDP <- df_GDP %>%
  mutate(
    # replace Czechia with Czech Republic
    CountryName = ifelse(CountryName %in% names(country_name_mapping), 
                         country_name_mapping[CountryName], 
                         CountryName)) %>%
  select("CountryName", "PeriodCode", "Value") %>%
  # now filter after standardizing the names
  filter(CountryName %in% survey_country_mapping$country_name)

df_GDP <- df_GDP %>%
  mutate(Value = as.numeric(as.character(Value)))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Value = as.numeric(as.character(Value))`.
## Caused by warning:
## ! NAs introduced by coercion
df_GDP <- df_GDP %>%
  # group by country
  group_by(CountryName) %>% 
  # find first and last year values
  summarize(
    gdp_2005 = Value[PeriodCode == 2005],
    gdp_2018 = Value[PeriodCode == 2018],
    # calculate relative growth
    gdp_growth = (gdp_2018 - gdp_2005) / gdp_2005 * 100) %>%
  # add ISO2 codes for easier joining with other datasets
  left_join(survey_country_mapping, by = c("CountryName" = "country_name")) %>%
  # select relevant columns
  select(CountryName, iso2, gdp_2005, gdp_2018, gdp_growth)

saveRDS(df_GDP, file = "df_GDP.rds")
write.csv(df_GDP, "df_GDP.csv")

European social survey

# 6. ESS Round 9 ----------------------------------------------------------
# https://ess.sikt.no/en/datafile/b2b0bf39-176b-4eca-8d26-3c05ea83d2cb
ess_data <- read_csv("data/raw/ESS9e03_2.csv")
## Rows: 49519 Columns: 572
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): name, proddate, cntry, ctzshipd, cntbrthd, lnghom1, lnghom2, fbrn...
## dbl (562): essround, edition, idno, dweight, pspwght, pweight, anweight, nws...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# select interesting variables, country and weight variables
ess_selected <- ess_data %>%
  select(
    # identifiers and weights
    cntry,          # country code
    pspwght,        # post-stratification weight
    dweight,        # design weight
    
    # key variables of interest
    freehms,        # gays and lesbians free to live life as they wish
    lrscale,        # left-right political scale
    rlgdgr,         # how religious are you
    ipeqopt,        # important that people are treated equally
    atchctr,        # attachment to country
    eduyrs,         # years of education
    agea)            # age of respondent

# function to recode ESS special values (negative values are typically missing values)
recode_ess_missing <- function(x) {
  ifelse(x < 0, NA, x)
}

# clean the data
ess_clean <- ess_selected %>%
  # recode special values to NA
  mutate(across(c(freehms, lrscale, rlgdgr, ipeqopt, atchctr, eduyrs, agea), 
                recode_ess_missing)) %>%
  # create derived variables if needed
  mutate(
    # recode freehms to 0-1 scale (originally 1-5 where 1 = agree strongly)
    freehms_support = case_when(
      freehms %in% c(1, 2) ~ 1,  # agree and strongly agree
      freehms %in% c(3, 4, 5) ~ 0,  # neutral, disagree, strongly disagree
      TRUE ~ NA_real_),
    
    # create age groups
    age_group = case_when(
      agea < 35 ~ "18-34",
      agea < 55 ~ "35-54",
      TRUE ~ "55+"
    ),
    
    # standardise left-right scale to 0-1
    lrscale_std = (lrscale - 1) / 9,  # Original scale is 1-10
    
    # create high education indicator (above country median)
    high_educ = NA  # will fill this in after calculating country medians
  )

# calculate country median education for relative education measure
country_medians <- ess_clean %>%
  group_by(cntry) %>%
  summarize(median_educ = median(eduyrs, na.rm = TRUE))

# join back to main data and create high education indicator
ess_clean <- ess_clean %>%
  left_join(country_medians, by = "cntry") %>%
  mutate(high_educ = ifelse(eduyrs > median_educ, 1, 0))

# calculate weighted means by country
country_aggregates <- ess_clean %>%
  # group by country
  group_by(cntry) %>%
  # calculate weighted statistics
  summarize(
    # sample size
    n_respondents = n(),
    n_valid = sum(!is.na(freehms)),
    
    # weighted means
    pct_lgbt_support = weighted.mean(freehms_support, w = pspwght, na.rm = TRUE) * 100,
    mean_religiosity = weighted.mean(rlgdgr, w = pspwght, na.rm = TRUE),
    mean_left_right = weighted.mean(lrscale_std, w = pspwght, na.rm = TRUE),
    mean_equal_values = weighted.mean(ipeqopt, w = pspwght, na.rm = TRUE),
    mean_country_attach = weighted.mean(atchctr, w = pspwght, na.rm = TRUE),
    mean_eduyrs = weighted.mean(eduyrs, w = pspwght, na.rm = TRUE),
    mean_age = weighted.mean(agea, w = pspwght, na.rm = TRUE),
    
    # weighted proportions for categorical variables
    pct_young = weighted.mean(age_group == "18-34", w = pspwght, na.rm = TRUE) * 100,
    pct_high_educ = weighted.mean(high_educ, w = pspwght, na.rm = TRUE) * 100,
    
    # standard errors (for confidence intervals)
    se_lgbt_support = sd(freehms_support, na.rm = TRUE) / sqrt(sum(!is.na(freehms_support))),
    
    # missing data proportions
    pct_missing_lgbt = mean(is.na(freehms)) * 100)

# calculate cross-variable country indicators
country_indicators <- ess_clean %>%
  group_by(cntry) %>%
  summarize(
    # correlation between age and LGBT support within country
    age_lgbt_corr = cor(agea, freehms_support, use = "pairwise.complete.obs", method = "spearman"),
    
    # correlation between religiosity and LGBT support
    relig_lgbt_corr = cor(rlgdgr, freehms_support, use = "pairwise.complete.obs", method = "spearman"),
    
    # inequality in LGBT support (standard deviation)
    lgbt_support_inequality = sd(freehms_support, na.rm = TRUE),
    
    # education gradient in LGBT support (difference between high and low education)
    educ_gradient = weighted.mean(freehms_support[high_educ == 1], w = pspwght[high_educ == 1], na.rm = TRUE) - 
      weighted.mean(freehms_support[high_educ == 0], w = pspwght[high_educ == 0], na.rm = TRUE))

# join the aggregates and indicators
country_data_final <- country_aggregates %>%
  left_join(country_indicators, by = "cntry") %>%
  # create ISO country codes for easier merging with other datasets
  mutate(
    iso2c = countrycode(cntry, "iso2c", "iso2c"),
    iso3c = countrycode(cntry, "iso2c", "iso3c"))

# plot LGBT support by country
ggplot(country_data_final, aes(x = reorder(cntry, pct_lgbt_support), y = pct_lgbt_support)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_errorbar(aes(ymin = pct_lgbt_support - 1.96*se_lgbt_support, 
                    ymax = pct_lgbt_support + 1.96*se_lgbt_support), 
                width = 0.2) +
  labs(title = "Support for LGBT Rights by Country",
       subtitle = "Percent agreeing gays and lesbians should be free to live as they wish",
       x = "Country",
       y = "Support (%)") +
  theme_minimal() +
  coord_flip()

# examine relationship between religiosity and LGBT support
ggplot(country_data_final, aes(x = mean_religiosity, y = pct_lgbt_support, label = cntry)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_text(hjust = -0.3, vjust = 0.3) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "Religiosity vs. LGBT Support by Country",
       x = "Mean Religiosity Score",
       y = "LGBT Support (%)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

saveRDS(country_data_final, file = "country_data_final.rds")
write.csv(country_data_final, "country_data_final.csv")

Unemployment rates

# 7. Unemployment rate ----------------------------------------------------
# https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_unemployment_rate

# scrape data from Wikipedia
url <- "https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_unemployment_rate"
page <- read_html(url)
tables <- html_table(page, fill = TRUE)
eu_unemployment_table <- tables[[1]]

# clean column names - simplify them to more standard names
colnames(eu_unemployment_table) <- c("Country", "Unemployment", "Employment", "Year")

# clean up the country names by removing footnote references
eu_unemployment_table$Country <- gsub("\\[.*?\\]", "", eu_unemployment_table$Country)

# make sure numeric columns are properly formatted
eu_unemployment_table$Unemployment <- as.character(eu_unemployment_table$Unemployment)
eu_unemployment_table$Employment <- as.numeric(as.character(eu_unemployment_table$Employment))
eu_unemployment_table$Year <- as.numeric(as.character(eu_unemployment_table$Year))

eu_unemployment_table <- eu_unemployment_table %>%
  mutate(
    # trim spaces from country names
    Country = trimws(Country),
    # convert Unemployment to numeric (remove any % signs or spaces if present)
    Unemployment = as.numeric(gsub("[^0-9.]", "", Unemployment)))

# modify Greece's unemployment rate to the 2018 value
eu_unemployment_table$Unemployment[eu_unemployment_table$Country == "Greece"] <- 19.18
eu_unemployment_table$Year[eu_unemployment_table$Country == "Greece"] <- 2018

# add the UK data manually
uk_data <- data.frame(
  Country = "United Kingdom",
  Unemployment = 4.12, # from Statista
  Employment = 75.25,  # estimated from https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/articles/singlemonthlabourforcesurveyestimates/december2018
  Year = 2018)

# append UK data to the table
eu_unemployment_table <- rbind(eu_unemployment_table, uk_data)

# ignore the year column
eu_unemployment_table <- eu_unemployment_table %>%
  select(-Year)

# save
saveRDS(eu_unemployment_table, file = "eu_unemployment_table.rds")
write.csv(eu_unemployment_table, "eu_unemployment_table.csv", row.names = FALSE)

Gender equality

# 8. Gender equality ------------------------------------------------------
# https://eige.europa.eu/gender-statistics/dgs/indicator/index__index_scores/datatable?time=2017&col=domain&row=geo
gender_eq_index <- read_xlsx("data/raw/index__index_scores.xlsx", range = "A16:V44")

gender_eq_index <- gender_eq_index %>%
  select(country_name = "Geographic region\\(Sub-) Domain Scores", 
         gender_equality_index = "Overall Gender Equality Index") %>%
  mutate(country_name = ifelse(country_name == "Czechia", "Czech Republic", country_name))

# Merge the data ----------------------------------------------------------
# create a base dataframe with country identifying variables
country_level_df <- survey_country_mapping

# merge V-Dem data
country_level_df <- country_level_df %>%
  left_join(vdem_2019, by = c("country_name" = "country_name"))

# fix country name in democracy_scores for Czech Republic if needed
if(any(democracy_scores$Country == "Czechia")) {
  democracy_scores$Country[democracy_scores$Country == "Czechia"] <- "Czech Republic"
}

# merge democracy scores
country_level_df <- country_level_df %>%
  left_join(democracy_scores, by = c("iso2" = "ISO2"))

# merge GDP data
country_level_df <- country_level_df %>%
  left_join(df_GDP, by = "iso2")

# rename some countries to match our country_name format
rainbow_country_mapping <- data.frame(
  original = c("Czechia", "Andorra", "Bosnia & Herzegovina", "North Macedonia", "United Kingdom"),
  standardized = c("Czech Republic", "Andorra", "Bosnia and Herzegovina", "Macedonia", "United Kingdom"),
  stringsAsFactors = FALSE)

# apply standardized country names
for(i in 1:nrow(rainbow_country_mapping)) {
  rainbow_df$Country[rainbow_df$Country == rainbow_country_mapping$original[i]] <- 
    rainbow_country_mapping$standardized[i]
}

# extract and rename rainbow map variables for clarity
rainbow_data_clean <- rainbow_df %>%
  select(
    Country,
    rainbow_score_2019 = Value_2019,
    rainbow_score_2018 = Value_2018,
    rainbow_score_avg_2019_2018 = Avg_2019_2018,
    rainbow_score_avg_2013_2014 = Avg_2013_2014,
    rainbow_score_difference = Difference)

# merge Rainbow Map data
country_level_df <- country_level_df %>%
  left_join(rainbow_data_clean, by = c("country_name" = "Country"))

# merge happiness data
country_level_df <- country_level_df %>%
  left_join(happiness_scores, by = c("iso2" = "ISO2"))

# merge unemployment data
country_level_df <- country_level_df %>%
  left_join(eu_unemployment_table, by = c("country_name" = "Country"))

# create a mapping between ESS country codes and ISO2
ess_country_mapping <- data.frame(
  cntry = c("AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR", 
            "DE", "HU", "IE", "IT", "LV", "LT", "NL", "PL", "PT", "RO", 
            "SK", "SI", "ES", "SE", "GB"),
  iso2 = c("AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR", 
           "DE", "HU", "IE", "IT", "LV", "LT", "NL", "PL", "PT", "RO", 
           "SK", "SI", "ES", "SE", "GB"),
  stringsAsFactors = FALSE)

# first ensure cntry codes match our iso2 codes
country_data_final <- country_data_final %>%
  left_join(ess_country_mapping, by = "cntry") %>%
  select(-iso2c, -iso3c) # remove original ISO codes to avoid confusion

# rename variables for clarity
ess_data_clean <- country_data_final %>%
  select(
    cntry,
    n_respondents,
    n_valid,
    lgbt_support_percent = pct_lgbt_support,
    mean_religiosity,
    mean_left_right,
    mean_equal_values,
    mean_country_attach,
    mean_eduyrs,
    mean_age,
    pct_young,
    pct_high_educ,
    se_lgbt_support,
    pct_missing_lgbt,
    age_lgbt_corr,
    relig_lgbt_corr,
    lgbt_support_inequality,
    educ_gradient,
    #iso3c
  )

# merge ESS data
country_level_df <- country_level_df %>%
  left_join(ess_data_clean, by = c("iso2" = "cntry"))

country_level_df %>% View()

# delete the first Greece row
greece_rows <- which(country_level_df$country_name == "Greece")

if (length(greece_rows) > 1) {
  # remove the first instance of Greece
  country_level_df <- country_level_df[-greece_rows[1], ]
  
  # verify the fix worked
  greece_check <- country_level_df %>%
    filter(country_name == "Greece")
  print("Greece entries after removing the first instance:")
  print(greece_check)
}

# add the gender equality index
country_level_df <- country_level_df %>%
  left_join(gender_eq_index, by = "country_name")

Scaling

We’ll apply different scaling methods based on the type of variable:

  • z-score standardization: for continuous variables to make them comparable (mean=0, sd=1)
  • min-max normalization: for variables with natural bounds (e.g., 0-100 scores)
  • log transformation: for heavily skewed variables like GDP
# custom function to standardize (z-score)
standardize <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

# custom function to min-max normalize
normalize <- function(x) {
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}

# z-scores for the democracy scores
country_level_df <- country_level_df %>%
  mutate(
    z_v2x_libdem = standardize(v2x_libdem),
    z_v2x_polyarchy = standardize(v2x_polyarchy),
    z_v2x_gender = standardize(v2x_gender),
    z_v2x_egaldem = standardize(v2x_egaldem),
    z_v2x_liberal = standardize(v2x_liberal),
    z_v2xcs_ccsi = standardize(v2xcs_ccsi),
    z_v2x_freexp = standardize(v2x_freexp))

# z-scores for GDP and unemployment rate, natural log for GDP
country_level_df <- country_level_df %>%
  mutate(
    z_gdp_2018 = standardize(gdp_2018),
    log_gdp_2018 = log(gdp_2018),
    z_gdp_growth = standardize(gdp_growth),
    z_unemployment = standardize(Unemployment))

# z-scores and normalised rainbow variables 
country_level_df <- country_level_df %>%
  mutate(
    z_rainbow_score = standardize(rainbow_score_2019),
    norm_rainbow_score = normalize(rainbow_score_2019),
    z_lgbt_support = standardize(lgbt_support_percent),
    norm_lgbt_support = normalize(lgbt_support_percent))

# z-socres and normalised for happiness scores and gender equality
country_level_df <- country_level_df %>%
  mutate(
    z_happiness = standardize(Happiness_Score),
    z_gender_equality = standardize(gender_equality_index),
    norm_gender_equality = normalize(gender_equality_index))

# religion, politics, education and age
country_level_df <- country_level_df %>%
  mutate(
    z_religiosity = standardize(mean_religiosity),
    z_functioning_of_government = standardize(Functioning_of_government),
    z_left_right = standardize(mean_left_right),
    z_equal_values = standardize(mean_equal_values),
    z_country_attach = standardize(mean_country_attach),
    z_eduyrs = standardize(mean_eduyrs),
    z_age = standardize(mean_age))

# create two composite scores and regional classification
country_level_df <- country_level_df %>%
  mutate(
    composite_equality = rowMeans(
      cbind(z_gender_equality, z_rainbow_score, z_v2x_gender),
      na.rm = TRUE))

country_level_df <- country_level_df %>%
  mutate(
    composite_democracy = rowMeans(
      cbind(z_v2x_libdem, z_v2x_polyarchy, z_v2x_libdem, z_v2x_freexp), 
      na.rm = TRUE))

country_level_df <- country_level_df %>%
  mutate(
    region = case_when(
      country_name %in% c("Denmark", "Finland", "Sweden", "Estonia", "Latvia", "Lithuania") ~ 
        "Northern Europe",
      country_name %in% c("Belgium", "Netherlands", "Luxembourg", "Germany", "France", 
                          "Austria", "United Kingdom", "Ireland") ~ 
          "Western Europe",
       country_name %in% c("Portugal", "Spain", "Italy", "Malta", "Greece", "Cyprus") ~ 
        "Southern Europe",
      country_name %in% c("Poland", "Czech Republic", "Slovakia", "Hungary", "Slovenia", 
                          "Croatia", "Romania", "Bulgaria") ~           "Eastern Europe",
        TRUE ~ NA_character_))

# delete the "Country.x", "Country.y" and "CountryName" columns
country_level_df <- country_level_df %>%
  select(-c("Country.x", "Country.y", "CountryName"))

# save as RDS and csv
saveRDS(country_level_df, file = "country_level_df.rds")
write.csv(country_level_df, "country_level_df.csv")

Non response analysis

data <- read_dta("data/raw/ZA7575.dta")

# create dataset with non-response indicator
non_numeric_vars <- c("qc19", "studyno1", "studyno2", "doi", "version",
                      "edition", "survey", "caseid", "uniqid", "serialid",
                      "tnscntry", "country")

# use the data that is already correctly coded
data_correctly_coded_nonresp_analysis <- data %>%
  mutate(
    # hard cases for me
    d72_1 = ifelse(d72_1 %in% c(5,6), NA, d72_1),
    d72_2 = ifelse(d72_2 %in% c(5,6), NA, d72_2),
    d60 = ifelse(d60 == 7, NA, d60),
    d25 = ifelse(d25 == 8, NA, d25),
    d8 = ifelse(d8 > 70, NA, d8), # subjective decision that no one can have more than 70 years of ed (even full-time professors)
    d7 = ifelse(d7 %in% c(15,97), NA, d7),
    d1 = ifelse(d1 %in% c(97,98), NA, d1),
    
    qa16_1 = ifelse(qa16_1 == 5, NA, qa16_1),
    qa16_2 = ifelse(qa16_2 == 5, NA, qa16_2),
    qa16_3 = ifelse(qa16_3 == 5, NA, qa16_3),
    qa16_4 = ifelse(qa16_4 == 5, NA, qa16_4),
    
    d71_1 = ifelse(d71_1 == 4, NA, d71_1),
    d71_2 = ifelse(d71_2 == 4, NA, d71_2),
    d71_3 = ifelse(d71_3 == 4, NA, d71_3),
    #qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
    #qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
    #qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
    #qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
    
    qb3_1 = ifelse(qb3_1 == 5, NA, qb3_1),
    qb3_2 = ifelse(qb3_2 == 5, NA, qb3_2),
    qb3_3 = ifelse(qb3_3 == 5, NA, qb3_3),
    qb3_4 = ifelse(qb3_4 == 5, NA, qb3_4),
    qb3_5 = ifelse(qb3_5 == 5, NA, qb3_5),
    qb3_6 = ifelse(qb3_6 == 5, NA, qb3_6),
    qb3_7 = ifelse(qb3_7 == 5, NA, qb3_7),
    
    qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
    qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
    qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
    qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
    qb4_5 = ifelse(qb4_5 == 5, NA, qb4_5),
    
    qb5_1 = ifelse(qb5_1 == 5, NA, qb5_1),
    qb5_2 = ifelse(qb5_2 == 5, NA, qb5_2),
    qb5_3 = ifelse(qb5_3 == 5, NA, qb5_3),
    qb5_4 = ifelse(qb5_4 == 5, NA, qb5_4),
    
    sd1_1 = ifelse(sd1_1 %in% c(3,4), NA, sd1_1),
    sd1_2 = ifelse(sd1_2 %in% c(3,4), NA, sd1_2),
    sd1_3 = ifelse(sd1_3 %in% c(3,4), NA, sd1_3),
    sd1_4 = ifelse(sd1_4 %in% c(3,4), NA, sd1_4),
    sd1_5 = ifelse(sd1_5 %in% c(3,4), NA, sd1_5),
    sd1_6 = ifelse(sd1_6 %in% c(3,4), NA, sd1_6),
    sd1_7 = ifelse(sd1_7 %in% c(3,4), NA, sd1_7),
    sd1_8 = ifelse(sd1_8 %in% c(3,4), NA, sd1_8),
    
    qc1_1 = ifelse(qc1_1 == 6, NA, qc1_1),
    qc1_2 = ifelse(qc1_2 == 6, NA, qc1_2),
    qc1_3 = ifelse(qc1_3 == 6, NA, qc1_3),
    qc1_4 = ifelse(qc1_4 == 6, NA, qc1_4),
    qc1_5 = ifelse(qc1_5 == 6, NA, qc1_5),
    qc1_6 = ifelse(qc1_6 == 6, NA, qc1_6),
    qc1_7 = ifelse(qc1_7 == 6, NA, qc1_7),
    qc1_8 = ifelse(qc1_8 == 6, NA, qc1_8),
    qc1_9 = ifelse(qc1_9 == 6, NA, qc1_9),
    qc1_10 = ifelse(qc1_10 == 6, NA, qc1_10),
    
    qc5_1 = ifelse(qc5_1 == 3, NA, qc5_1),
    qc5_2 = ifelse(qc5_2 == 3, NA, qc5_2),
    qc5_3 = ifelse(qc5_3 == 3, NA, qc5_3),
    qc5_4 = ifelse(qc5_4 == 3, NA, qc5_4),
    
    qc6_1 = ifelse(qc6_1 == 12, NA, qc6_1),
    qc6_2 = ifelse(qc6_2 == 12, NA, qc6_2),
    qc6_3 = ifelse(qc6_3 == 12, NA, qc6_3),
    qc6_4 = ifelse(qc6_4 == 12, NA, qc6_4),
    qc6_5 = ifelse(qc6_5 == 12, NA, qc6_5),
    qc6_6 = ifelse(qc6_6 == 12, NA, qc6_6),
    qc6_7 = ifelse(qc6_7 == 12, NA, qc6_7),
    qc6_8 = ifelse(qc6_8 == 12, NA, qc6_8),
    qc6_9 = ifelse(qc6_9 == 12, NA, qc6_9),
    qc6_10 = ifelse(qc6_10 == 12, NA, qc6_10),
    qc6_11 = ifelse(qc6_11 == 12, NA, qc6_11),
    
    qc9_1 = ifelse(qc9_1 == 7, NA, qc9_1),
    qc9_2 = ifelse(qc9_2 == 7, NA, qc9_2),
    qc9_3 = ifelse(qc9_3 == 7, NA, qc9_3),
    qc9_4 = ifelse(qc9_4 == 7, NA, qc9_4),
    qc9_5 = ifelse(qc9_5 == 7, NA, qc9_5),
    qc9_6 = ifelse(qc9_6 == 7, NA, qc9_6),
    qc9_7 = ifelse(qc9_7 == 7, NA, qc9_7),
    qc9_8 = ifelse(qc9_8 == 7, NA, qc9_8),
    qc9_9 = ifelse(qc9_9 == 7, NA, qc9_9),
    qc9_10 = ifelse(qc9_10 == 7, NA, qc9_10),
    qc9_11 = ifelse(qc9_11 == 7, NA, qc9_11),
    
    qc11_1 = ifelse(qc11_1 == 6, NA, qc11_1),
    qc11_2 = ifelse(qc11_2 == 6, NA, qc11_2),
    qc11_3 = ifelse(qc11_3 == 6, NA, qc11_3),
    qc11_4 = ifelse(qc11_4 == 6, NA, qc11_4),
    qc11_5 = ifelse(qc11_5 == 6, NA, qc11_5),
    qc11_6 = ifelse(qc11_6 == 6, NA, qc11_6),
    
    qc12_1 = ifelse(qc12_1 %in% c(11,12,13), NA, qc12_1),
    qc12_2 = ifelse(qc12_2 %in% c(11,12,13), NA, qc12_2),
    qc12_3 = ifelse(qc12_3 %in% c(11,12,13), NA, qc12_3),
    qc12_4 = ifelse(qc12_4 %in% c(11,12,13), NA, qc12_4),
    qc12_5 = ifelse(qc12_5 %in% c(11,12,13), NA, qc12_5),
    qc12_6 = ifelse(qc12_6 %in% c(11,12,13), NA, qc12_6),
    qc12_7 = ifelse(qc12_7 %in% c(11,12,13), NA, qc12_7),
    qc12_8 = ifelse(qc12_8 %in% c(11,12,13), NA, qc12_8),
    qc12_9 = ifelse(qc12_9 %in% c(11,12,13), NA, qc12_9),
    qc12_10 = ifelse(qc12_10 %in% c(11,12,13), NA, qc12_10),
    qc12_11 = ifelse(qc12_11 %in% c(11,12,13), NA, qc12_11),
    qc12_12 = ifelse(qc12_12 %in% c(11,12,13), NA, qc12_12),
    qc12_13 = ifelse(qc12_13 %in% c(11,12,13), NA, qc12_13),
    qc12_14 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_14),
    qc12_15 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_15),
    
    qc13_1 = ifelse(qc13_1 %in% c(11,12,13), NA, qc13_1),
    qc13_2 = ifelse(qc13_2 %in% c(11,12,13), NA, qc13_2),
    qc13_3 = ifelse(qc13_3 %in% c(11,12,13), NA, qc13_3),
    qc13_4 = ifelse(qc13_4 %in% c(11,12,13), NA, qc13_4),
    qc13_5 = ifelse(qc13_5 %in% c(11,12,13), NA, qc13_5),
    qc13_6 = ifelse(qc13_6 %in% c(11,12,13), NA, qc13_6),
    qc13_7 = ifelse(qc13_7 %in% c(11,12,13), NA, qc13_7),
    qc13_8 = ifelse(qc13_8 %in% c(11,12,13), NA, qc13_8),
    qc13_9 = ifelse(qc13_9 %in% c(11,12,13), NA, qc13_9),
    qc13_10 = ifelse(qc13_10 %in% c(11,12,13), NA, qc13_10),
    qc13_11 = ifelse(qc13_11 %in% c(11,12,13), NA, qc13_11),
    qc13_12 = ifelse(qc13_12 %in% c(11,12,13), NA, qc13_12),
    qc13_13 = ifelse(qc13_13 %in% c(11,12,13), NA, qc13_13),
    qc13_14 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_14),
    qc13_15 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_15),
    
    qc15_1 = ifelse(qc15_1 == 5, NA, qc15_1),
    qc15_2 = ifelse(qc15_2 == 5, NA, qc15_2),
    qc15_3 = ifelse(qc15_3 == 5, NA, qc15_3),
    
    qc16_1 = ifelse(qc16_1 == 5, NA, qc16_1),
    
    qc17_1 = ifelse(qc17_1 == 5, NA, qc17_1),
    qc17_2 = ifelse(qc17_2 == 5, NA, qc17_2),
    qc17_3 = ifelse(qc17_3 == 5, NA, qc17_3),
    qc17_4 = ifelse(qc17_4 == 5, NA, qc17_4),
    qc17_5 = ifelse(qc17_5 == 5, NA, qc17_5),
    qc17_6 = ifelse(qc17_6 == 5, NA, qc17_6),
    qc17_7 = ifelse(qc17_7 == 5, NA, qc17_7),
    
    qc18_1 = ifelse(qc18_1 %in% c(11,12), NA, qc18_1),
    qc18_2 = ifelse(qc18_2 %in% c(11,12), NA, qc18_2),
    qc18_3 = ifelse(qc18_3 %in% c(11,12), NA, qc18_3),
    
    sd3 = ifelse(sd3 %in% c(15,16), NA, sd3),
    
    # easy cases for Claude
    #q1 = ifelse(q1 %in% c(29,30), NA, q1),
    
    qa1 = ifelse(qa1 == 5, NA, qa1),
    
    qa7 = ifelse(qa7 == 5, NA, qa7),
    
    qa8 = ifelse(qa8 == 4, NA, qa8),
    
    qa9 = ifelse(qa9 == 6, NA, qa9),
    
    qa14 = ifelse(qa14 == 6, NA, qa14),
    
    qa17 = ifelse(qa17 == 5, NA, qa17),
    
    qb6 = ifelse(qb6 == 4, NA, qb6),
    
    qb7 = ifelse(qb7 == 5, NA, qb7),
    
    qc19 = ifelse(qc19 == 3, NA, qc19),
    
    qc20 = ifelse(qc20 == 3, NA, qc20),
    
    d63 = ifelse(d63 %in% c(6,7,8,9), NA, d63), # manually because stupid Claude
    
    d77 = ifelse(d77 == 5, NA, d77),
    
    d70 = ifelse(d70 == 5, NA, d70),
    
    qa4a = ifelse(qa4a %in% c(7,8), NA, qa4a),
    
    qa5a = ifelse(qa5a %in% c(11,12,13), NA, qa5a),
    
    qa11 = ifelse(qa11 %in% c(4,5), NA, qa11),
    
    qa12 = ifelse(qa12 %in% c(5,6), NA, qa12),
    
    qa13 = ifelse(qa13 %in% c(6,7), NA, qa13),
    
    qa18a = ifelse(qa18a %in% c(7,8,9), NA, qa18a),
    
    qb8 = ifelse(qb8 == 5, NA, qb8),
    
    sd3 = ifelse(sd3 == 16, NA, sd3),
    
    qc3 = ifelse(qc3 %in% c(10,11), NA, qc3),
    
    qc7 = ifelse(qc7 %in% c(11,12), NA, qc7),
    
    qc8 = ifelse(qc8 %in% c(11,12), NA, qc8),
    
    qc10 = ifelse(qc10 %in% c(9,10), NA, qc10))

nonresp_analysis <- data_correctly_coded_nonresp_analysis %>% 
  mutate(nonresponse = ifelse(is.na(qc19), 1, 0)) %>% 
  select(-all_of(non_numeric_vars))

## calculate correlations with target's NAs
# start by comparing to countries
nonresp_by_country <- nonresp_analysis %>% 
  group_by(isocntry) %>% 
  summarise(
    nonresp_count = sum(nonresponse),
    total_resp = n(),
    nonresp_pct = nonresp_count/total_resp*100)

# visualize country variation
ggplot(nonresp_by_country, 
       aes(x = reorder(isocntry, -nonresp_pct), y = nonresp_pct)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Non-Response Rates by Country",
    x = "Country Code",
    y = "Non-Response Percentage")

# identify likely categorical variables (those with few unique values)
var_uniqueness <- sapply(nonresp_analysis, function(x) {
  if(is.numeric(x)) {
    length(unique(x))
  } else {
    NA
  }
})

# consider variables with 10 or fewer unique values as potential categorical variables
likely_categorical <- names(var_uniqueness[!is.na(var_uniqueness) & var_uniqueness < 6])
likely_categorical <- setdiff(likely_categorical, c("nonresponse", "qc19"))

# 
chi_square_results <- data.frame(variable = character(), 
                                 p_value = numeric(),
                                 stringsAsFactors = FALSE)

for (var in likely_categorical) {
  # Create contingency table
  cont_table <- table(nonresp_analysis$nonresponse, nonresp_analysis[[var]])
  # Calculate chi-square test
  chi_test <- chisq.test(cont_table)
  # Store result
  chi_square_results <- rbind(chi_square_results, 
                              data.frame(variable = var, p_value = chi_test$p.value))
}
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
# Show significant associations
chi_square_results %>% 
  filter(p_value < 0.05) %>%
  arrange(p_value)
##     variable       p_value
## 1      q1_30  0.000000e+00
## 2       eu28  0.000000e+00
## 3    qc6_10r 2.044685e-290
## 4       qc7r 4.886901e-252
## 5    qc6_11r 1.407293e-250
## 6     qc6_2r 4.356071e-225
## 7    qc18_2r 1.612896e-203
## 8    qc18_3r 8.893443e-200
## 9     qc11_6 8.014728e-181
## 10     qc14r 1.431519e-177
## 11    qc11_4 4.128275e-164
## 12    qc11_1 3.882881e-163
## 13    qc11_2 2.545518e-162
## 14    qc11_3 3.055025e-159
## 15    qc11_5 4.986443e-158
## 16   qa5t_13 1.698969e-141
## 17    qa10_6 1.265817e-139
## 18      qc8r 2.025190e-138
## 19    qc4_18 3.460117e-132
## 20     qb2_2 2.610051e-115
## 21    qc6_4r 3.079972e-115
## 22    qc6_3r 2.904656e-114
## 23   qc18_1r 1.158141e-107
## 24      d62t 2.685434e-107
## 25    qc6_5r 1.268464e-106
## 26    qc6_8r  2.092497e-99
## 27    qc6_9r  2.593222e-98
## 28      d1r1  1.385848e-96
## 29    qc6_6r  3.672115e-88
## 30    qc6_7r  9.272870e-88
## 31     qb2_4  1.225998e-84
## 32     sd1_4  1.123999e-82
## 33    qb9_12  1.923772e-81
## 34    qa4t_8  2.088283e-79
## 35    qc15_3  2.277333e-77
## 36    qc15_2  1.051620e-73
## 37    qc6_1r  1.604390e-71
## 38     qa6_8  1.362904e-70
## 39     qb2_1  2.246904e-69
## 40     qb2_3  1.393748e-67
## 41        p5  9.708885e-67
## 42    qb1_12  7.900936e-63
## 43     q1_26  1.288093e-62
## 44  eu_nms12  3.913407e-62
## 45     qa5t1  3.665003e-61
## 46     sd1_2  6.049019e-59
## 47  euronz13  1.062676e-52
## 48      eu15  5.168859e-52
## 49  eu_nms13  5.168859e-52
## 50   qa18t_8  2.515989e-51
## 51       qb6  1.748491e-49
## 52    qc4t_2  8.497680e-48
## 53      opls  1.413440e-46
## 54  euronz15  1.372073e-45
## 55   euroz13  1.405807e-44
## 56    qc15_1  1.952760e-44
## 57     d11r1  2.265525e-44
## 58    qc4t_1  1.194079e-43
## 59      gen3  2.756845e-42
## 60    eu12_1  4.517978e-42
## 61      gen2  5.559090e-41
## 62    qa4b_8  9.351894e-41
## 63   qa18t_4  2.452999e-40
## 64   euroz15  1.183887e-37
## 65    eu_ac2  1.793942e-37
## 66      qc20  3.218264e-37
## 67    eu12_2  9.452635e-37
## 68     qc5_2  1.037711e-36
## 69     qc5_1  2.121594e-34
## 70   qa18t_9  2.096787e-33
## 71     qc4_1  6.531364e-33
## 72    qc4t_3  1.531475e-32
## 73  euronz16  2.178984e-32
## 74   qa5b_13  3.544533e-32
## 75      eu10  4.714907e-32
## 76       qa1  5.400879e-31
## 77       d77  1.413699e-30
## 78  eurnz18c  4.786919e-30
## 79      d43t  7.120921e-30
## 80    qc4_13  1.816966e-29
## 81     qc5_4  1.960980e-29
## 82       eu9  4.872873e-28
## 83     qb9_7  2.582998e-27
## 84   qa18t_5  5.199612e-27
## 85     qa3_7  1.647446e-26
## 86       eu6  1.765025e-26
## 87    qc4_14  2.845552e-26
## 88    qc2_16  5.917563e-26
## 89   euroz16  1.679575e-25
## 90      q1_1  2.475417e-25
## 91     qc4_4  4.637918e-25
## 92      d43b  5.058858e-25
## 93    qa5t_1  5.858261e-25
## 94   qa18b_9  1.041380e-24
## 95     qc4_5  1.271743e-24
## 96     qc2t2  2.459485e-24
## 97       d70  2.960622e-24
## 98  eu_nms10  3.100520e-24
## 99    qa4t_2  5.365959e-24
## 100    qc4_3  8.206178e-23
## 101  polintr  8.566328e-23
## 102   qc4_15  1.299652e-22
## 103   qa5t_2  4.202083e-22
## 104    d72_1  5.959396e-22
## 105    qc2t1  9.676312e-22
## 106    qc4_8  3.644844e-21
## 107    sd1_1  3.950729e-21
## 108     gen6  4.019052e-21
## 109    d71_1  5.836218e-21
## 110    sd1_7  7.422958e-21
## 111   qa5t_3  1.036522e-20
## 112    qc5_3  2.356009e-20
## 113  d40_d11  2.578091e-19
## 114 eurnz17a  3.034910e-19
## 115    d71_2  3.358979e-19
## 116   qc4_10  7.509064e-19
## 117    d72_2  1.712351e-18
## 118     eu25  3.108513e-18
## 119   eu_cc3  3.108513e-18
## 120    q1_10  4.665997e-18
## 121  qa18t_2  2.003105e-17
## 122      qb8  4.583134e-17
## 123    qb1_7  5.338023e-17
## 124   qc2t_2  6.837072e-17
## 125  d15a_r1  7.026568e-17
## 126    qb3_6  7.937370e-17
## 127   qa5t_7  1.481144e-16
## 128    qc4_9  2.086661e-16
## 129    sd2_9  9.372057e-16
## 130   qc17_3  1.118165e-15
## 131   qb1_11  2.203586e-15
## 132   qc17_2  3.325650e-15
## 133    qc2_3  3.824190e-15
## 134   qa10_2  4.623632e-15
## 135      qa8  6.209991e-15
## 136   qc2t_1  1.592234e-14
## 137    qb9_3  1.680271e-14
## 138    qc4_7  1.858493e-14
## 139    qc4_2  2.119543e-14
## 140    qb1_4  2.446559e-14
## 141 eurnz18b  2.531609e-14
## 142  euroz17  5.449408e-14
## 143 eurnz17b  5.449408e-14
## 144    q1_18  6.255103e-14
## 145   sd2_10  7.213815e-14
## 146    qb1_2  1.470240e-13
## 147  qa18b_8  2.733686e-13
## 148     qa12  3.053371e-13
## 149   qc17_5  3.382632e-13
## 150   qc17_4  4.134190e-13
## 151   qa4t_6  4.334440e-13
## 152   qa4t_3  4.975980e-13
## 153    sd1_6  8.092142e-13
## 154    qa5t2  8.925986e-13
## 155   qc2_15  1.017303e-12
## 156    q1_23  1.653697e-12
## 157    qc4_6  2.306592e-12
## 158    qb3_2  2.520115e-12
## 159    qb9_5  3.651558e-12
## 160    qb5_1  4.491434e-12
## 161 d40abc_r  4.522913e-12
## 162    qb3_4  7.085733e-12
## 163    qb3_1  7.955607e-12
## 164    qb4_2  8.896783e-12
## 165    qb9_8  1.167710e-11
## 166   qc17_6  1.590429e-11
## 167    sd1_8  1.752213e-11
## 168    qb1_8  1.957163e-11
## 169    d71_3  2.209437e-11
## 170    qa6_3  2.855182e-11
## 171     gen5  3.079030e-11
## 172    qb3_5  3.911969e-11
## 173    qb9_6  1.112472e-10
## 174    qb5_2  2.270824e-10
## 175    qb4_4  3.276973e-10
## 176   p7gr_r  3.949888e-10
## 177    q1_24  4.374818e-10
## 178    qa6_6  5.266917e-10
## 179    q1_16  7.943816e-10
## 180  euroz18  9.087967e-10
## 181 eurnz18a  9.087967e-10
## 182    qb3_3  9.446784e-10
## 183    qb3_7  1.435136e-09
## 184    qb1_5  1.864389e-09
## 185  qa18b_5  2.440696e-09
## 186   qa10_3  3.507219e-09
## 187   qa5t_6  3.992039e-09
## 188    sd2_5  4.884312e-09
## 189   qc17_1  7.380343e-09
## 190    qa6_1  7.512216e-09
## 191      qb7  8.344996e-09
## 192    qb1_1  8.450203e-09
## 193   qc2t_3  9.265775e-09
## 194     gen4  9.906682e-09
## 195   qa10_4  1.858086e-08
## 196    qb5_4  2.126871e-08
## 197   qa4t_4  2.503853e-08
## 198    qb5_3  2.971480e-08
## 199   qc2_13  3.365795e-08
## 200   qc4_17  3.777663e-08
## 201    qa3_3  4.663245e-08
## 202  qa18b_1  5.692094e-08
## 203   qa4b_2  7.597421e-08
## 204     sd2t  9.306867e-08
## 205   qc16_1  1.009968e-07
## 206    sd2_3  1.333819e-07
## 207   qa5t_8  1.371978e-07
## 208  qa18b_4  1.677539e-07
## 209   qc2_11  2.080382e-07
## 210   qa5b_3  3.181619e-07
## 211   qa10_5  3.471105e-07
## 212    q1_13  3.638465e-07
## 213   qc4_12  3.842966e-07
## 214     p6hr  4.821467e-07
## 215  euroz19  5.578794e-07
## 216 euronz19  5.578794e-07
## 217     p6lt  5.757985e-07
## 218   qc17_7  6.800755e-07
## 219    qa2_8  7.098682e-07
## 220    qb4_3  7.196575e-07
## 221   qc2_12  7.777325e-07
## 222   qa15_5  9.602951e-07
## 223    qb9_4  1.078878e-06
## 224      d10  1.482212e-06
## 225   d40a_r  1.528080e-06
## 226     p6ro  1.645223e-06
## 227   qa5b_1  1.688887e-06
## 228   qa5t_5  2.690016e-06
## 229   qa4t_7  2.929730e-06
## 230    qc2_4  2.996157e-06
## 231    qa2_6  3.120801e-06
## 232    qb4_1  3.238338e-06
## 233     d43a  3.411716e-06
## 234    qc2_2  4.496683e-06
## 235  nutslvl  4.874204e-06
## 236    qc2_9  6.122456e-06
## 237   qa4b_7  7.674815e-06
## 238    qa6_7  1.352228e-05
## 239   d40b_r  1.759982e-05
## 240     q1_2  2.005715e-05
## 241    qb4_5  2.015061e-05
## 242    sd2_2  2.049006e-05
## 243    qa3_4  2.589069e-05
## 244    qa2_4  2.736553e-05
## 245     p6ie  3.764434e-05
## 246    qb9_1  4.856049e-05
## 247    sd1_5  6.451635e-05
## 248   qa5t_4  6.531201e-05
## 249  eu_nms3  7.696703e-05
## 250     p7fi  1.024245e-04
## 251    qb1_9  1.208415e-04
## 252   p7ro_r  1.337295e-04
## 253      qa7  2.426730e-04
## 254    qa3_6  2.548023e-04
## 255  qa5t_12  3.089768e-04
## 256    q1_20  3.223279e-04
## 257    q1_28  4.560014e-04
## 258   qa5b_7  4.854006e-04
## 259    qa2_3  6.196268e-04
## 260     qa17  6.970119e-04
## 261    qa6_2  7.354174e-04
## 262     eu27  8.092725e-04
## 263     q1_5  8.187847e-04
## 264    q1_21  9.613980e-04
## 265     p7sk  9.944782e-04
## 266   qa15_1  1.064753e-03
## 267   qa16_2  1.352530e-03
## 268    qa2_5  1.751906e-03
## 269   qa4b_6  2.155548e-03
## 270    q1_25  2.321971e-03
## 271   qa15_3  2.400124e-03
## 272   qa16_1  2.422900e-03
## 273     q1_4  2.451687e-03
## 274    qc2_6  2.773349e-03
## 275    qc2_7  2.827257e-03
## 276   qb9_11  2.831141e-03
## 277    q1_17  2.854416e-03
## 278   qc2_10  3.152968e-03
## 279    qa3_5  3.873243e-03
## 280    qa3_1  4.025280e-03
## 281     p6fi  4.211379e-03
## 282     p6de  4.399833e-03
## 283   qa5b_2  5.148265e-03
## 284  qa5b_11  5.324307e-03
## 285   qb9_10  6.119215e-03
## 286   qc2_14  6.796681e-03
## 287  qa18t_3  6.817408e-03
## 288     q1_6  7.295408e-03
## 289    qc2_1  8.368310e-03
## 290      p13  1.063394e-02
## 291    qc2_8  1.080884e-02
## 292   qa5b_6  1.101767e-02
## 293    q1_11  1.166239e-02
## 294   qa4b_4  1.275666e-02
## 295     q1_9  1.370648e-02
## 296   qa4t_5  1.396210e-02
## 297    qb9_9  1.474250e-02
## 298   d40c_r  1.657626e-02
## 299     p6sk  1.690870e-02
## 300  qa18b_2  1.694901e-02
## 301     p6it  2.186514e-02
## 302    qa2_1  2.281163e-02
## 303   qa5b_9  2.649771e-02
## 304   qc4_11  2.671142e-02
## 305     gen1  2.673010e-02
## 306    p13lv  2.756553e-02
## 307    qb1_6  2.881619e-02
## 308     p6lv  2.911116e-02
## 309    p13fi  3.011088e-02
## 310   qa4t_1  3.177702e-02
## 311   qa10_1  3.357706e-02
## 312    sd2_6  4.182597e-02
## 313    sd2_7  4.294333e-02
##
continuous_vars <- names(var_uniqueness[!is.na(var_uniqueness) & var_uniqueness >= 6])
continuous_vars <- setdiff(continuous_vars, c("nonresponse", "qc19"))

correlation_results <- data.frame(variable = character(), 
                                  correlation = numeric(),
                                  stringsAsFactors = FALSE)

for (var in continuous_vars) {
  # Calculate correlation
  cor_val <- cor(nonresp_analysis$nonresponse, nonresp_analysis[[var]], 
                 use = "pairwise.complete.obs")
  # Store result
  correlation_results <- rbind(correlation_results, 
                               data.frame(variable = var, correlation = cor_val))
}

# Show strongest correlations
correlation_results %>% arrange(desc(abs(correlation)))
##     variable   correlation
## 1       p7ro  1.961854e-01
## 2       p7gr  1.858957e-01
## 3       p7ie  1.701746e-01
## 4     netuse  1.320891e-01
## 5      d62_1  1.302669e-01
## 6      d62_3  1.301065e-01
## 7       p7cy -1.284124e-01
## 8   qc12_11r  1.268925e-01
## 9   qc12_12r  1.267441e-01
## 10      d1r2  1.220472e-01
## 11   qc13_4r  1.200788e-01
## 12   qc13_8r  1.200647e-01
## 13    qc18_2 -1.188009e-01
## 14    qc18_3 -1.184702e-01
## 15   qc12_4r  1.119538e-01
## 16   qc12_7r  1.107519e-01
## 17  qc13_14r  1.105615e-01
## 18   qc12_8r  1.104514e-01
## 19   qc13_10 -1.102491e-01
## 20  qc12_14r  1.089403e-01
## 21   qc12_9r  1.048803e-01
## 22   qc13_11 -1.039340e-01
## 23  qc12_15r  1.029836e-01
## 24      p7pt -1.026675e-01
## 25   qc12_5r  1.015150e-01
## 26  qc12_13r  1.006697e-01
## 27      p7ee  9.993185e-02
## 28     qc1_6  9.991443e-02
## 29   qc13_12 -9.977293e-02
## 30       d11  9.818044e-02
## 31    qc9_10  9.744025e-02
## 32    qc9_11  9.733104e-02
## 33   qc13_5r  9.658877e-02
## 34     d11r3  9.652444e-02
## 35    p7pl_r -9.547155e-02
## 36       d63 -9.471330e-02
## 37   qc13_9r  9.379196e-02
## 38     qc9_5  9.374359e-02
## 39  qc13_12r  9.244527e-02
## 40  qc13_11r  9.138196e-02
## 41      qc14  9.011918e-02
## 42     d11r2  8.982005e-02
## 43   qc13_7r  8.977956e-02
## 44  qc12_10r  8.886650e-02
## 45     d62_4  8.878063e-02
## 46      qa14 -8.872688e-02
## 47   qc12_3r  8.844414e-02
## 48     qc1_9  8.806345e-02
## 49  qc13_13r  8.756854e-02
## 50     d62_2  8.737139e-02
## 51     qc1_1  8.732439e-02
## 52   qc12_6r  8.498051e-02
## 53     qc9_2  8.406519e-02
## 54     qc1_2  8.247491e-02
## 55   qc12_2r  8.210264e-02
## 56     qc9_8  7.925402e-02
## 57     qc9_3  7.922324e-02
## 58   qc13_6r  7.639419e-02
## 59     qc9_1  7.593227e-02
## 60    qc13_2 -7.521485e-02
## 61      d7r2  7.414868e-02
## 62     qc9_9  7.377205e-02
## 63     qc9_4  7.252654e-02
## 64     qc1_4  7.133556e-02
## 65  qc13_15r  7.042722e-02
## 66      p7dk -7.021620e-02
## 67     qc9_6  6.973673e-02
## 68     qc9_7  6.943869e-02
## 69     qc1_8  6.693601e-02
## 70   qc13_3r  6.605581e-02
## 71      p7it -6.550028e-02
## 72     qc1_5  6.390597e-02
## 73   p7it_r2 -6.300557e-02
## 74     qa18a -6.256839e-02
## 75   p7es_r2 -6.217328e-02
## 76   qc12_1r  6.216130e-02
## 77      p7se -6.162646e-02
## 78     qc1_7  6.122670e-02
## 79    qc1_10  6.098965e-02
## 80    qc13_1 -6.043364e-02
## 81    qc13_3 -6.036766e-02
## 82     qc1_3  6.034092e-02
## 83     qc6_2 -5.859010e-02
## 84   qc12_15  5.853208e-02
## 85       w24  5.839735e-02
## 86       w94  5.711731e-02
## 87   p7it_r1 -5.694419e-02
## 88   qc13_1r  5.660931e-02
## 89      p7si -5.650184e-02
## 90        d8 -5.571681e-02
## 91   p7es_r1 -5.461270e-02
## 92       sd3 -5.407886e-02
## 93      p7de  5.350126e-02
## 94   d15a_r2  5.342388e-02
## 95    qc6_10 -5.205970e-02
## 96       w84  5.197034e-02
## 97      p7be -5.125235e-02
## 98   qc13_2r  5.106552e-02
## 99    qc13_7 -5.051020e-02
## 100      qa9  5.022430e-02
## 101   qc6_11 -5.005109e-02
## 102   qc18_1 -4.802767e-02
## 103  qc12_10 -4.731778e-02
## 104     p7pl -4.681212e-02
## 105     p7at  4.323600e-02
## 106     d8r2 -4.268455e-02
## 107    qc6_4 -4.252304e-02
## 108      w13  4.137546e-02
## 109   qc13_8  4.093315e-02
## 110  qc12_11 -4.081044e-02
## 111   qc13_6 -3.986910e-02
## 112     d7r1  3.956674e-02
## 113  qc12_12 -3.806482e-02
## 114   qc12_8  3.785765e-02
## 115  qc13_13 -3.776671e-02
## 116   d40abc -3.759386e-02
## 117   qc12_1 -3.656406e-02
## 118     p7lv -3.623722e-02
## 119     qa5a  3.364841e-02
## 120     d15a -3.361297e-02
## 121     qc10  3.355247e-02
## 122    qc6_5 -3.322756e-02
## 123   qc13_4  3.295031e-02
## 124   qc12_4  3.224803e-02
## 125      w10 -3.215236e-02
## 126  qc12_14  3.166535e-02
## 127   p7gb_r -3.125343e-02
## 128     d40b -3.080266e-02
## 129    qc6_1  3.064529e-02
## 130     p7bg  2.995536e-02
## 131 qc13_10r  2.780521e-02
## 132     p7lt -2.708901e-02
## 133      w11 -2.669060e-02
## 134   qc12_2 -2.569431e-02
## 135       w8 -2.537512e-02
## 136     d40a -2.480309e-02
## 137       d7  2.394939e-02
## 138   qc12_3 -2.389543e-02
## 139       w9 -2.273540e-02
## 140  qc13_15  2.269340e-02
## 141     d7r3 -2.176323e-02
## 142      w30  2.139600e-02
## 143       d1  2.111394e-02
## 144       p7 -2.067377e-02
## 145      w29 -2.051203e-02
## 146  qc13_14  2.046962e-02
## 147      w81 -1.890283e-02
## 148      w82  1.867097e-02
## 149      w89 -1.861186e-02
## 150      w98 -1.827807e-02
## 151      w90  1.815757e-02
## 152      w85 -1.796005e-02
## 153    qc6_6  1.795772e-02
## 154   qc12_6 -1.784957e-02
## 155       w7 -1.780729e-02
## 156      w83  1.755898e-02
## 157     d40c -1.730106e-02
## 158       w6 -1.702210e-02
## 159      w99  1.676012e-02
## 160      w86  1.616610e-02
## 161     qa4a -1.543773e-02
## 162     d15b  1.527444e-02
## 163   qc13_5 -1.478621e-02
## 164   qc12_9  1.420123e-02
## 165      w14 -1.353171e-02
## 166      qc3 -1.322936e-02
## 167   qc12_7 -1.269117e-02
## 168    qc6_9  1.222451e-02
## 169      qc7 -1.208909e-02
## 170       w1 -1.179826e-02
## 171       w5 -1.173662e-02
## 172    qc6_8  1.173133e-02
## 173  qc12_13  1.133439e-02
## 174      qc8 -1.051202e-02
## 175    qc6_3 -1.026962e-02
## 176     p7fr -9.833273e-03
## 177   p7fr_r -9.373501e-03
## 178     p7cz -9.242256e-03
## 179       p2 -9.232808e-03
## 180    qc6_7  9.021552e-03
## 181     p7hu  8.947736e-03
## 182   qc12_5  8.808118e-03
## 183       p3 -8.225566e-03
## 184     qa13  7.855707e-03
## 185      w23 -7.268646e-03
## 186      wex -7.268645e-03
## 187   qc13_9 -6.882890e-03
## 188      w22 -6.743143e-03
## 189     p7nl -6.721291e-03
## 190     p7es  5.942449e-03
## 191       w3  4.828364e-03
## 192      p3r -3.139806e-03
## 193      w87 -1.910455e-03
## 194     p7gb  1.732731e-03
## 195     d8r1  1.383810e-03
## 196     p7hr  3.642174e-04
## 197       p1 -6.137684e-05

We can see that non-response in qc19 is not random; it is highly correlated to numerous variables (both categorical and continuous) such as countries (various q1) or regions (p7) responses to LGBT/ discrimination questions (such as qc6, qc7, qc12, qc13 and qc18) and paradata/ survey cooperation (p5). Therefore, the 12% missingness of qc19 is definitely not MCAR, but at least MAR or even MNAR.

Data cleaning

“Don’t know” (DK) responses are especially problematic for ordinal variables, because leaving them as is would inflate support levels, while recoding them as 0 could misrepresent them as the most negative response. We will therefore code all DKs and refusals (and a few other special cases) as NA.

data <- read_dta("data/raw/ZA7575.dta")

data_correctly_coded <- data %>%
  mutate(
    # hard cases for me
    d72_1 = ifelse(d72_1 %in% c(5,6), NA, d72_1),
    d72_2 = ifelse(d72_2 %in% c(5,6), NA, d72_2),
    d60 = ifelse(d60 == 7, NA, d60),
    d25 = ifelse(d25 == 8, NA, d25),
    d8 = ifelse(d8 > 70, NA, d8), # subjective decision that no one can have more than 70 years of ed (even full-time professors)
    d7 = ifelse(d7 %in% c(15,97), NA, d7),
    d1 = ifelse(d1 %in% c(97,98), NA, d1),
    
    qa16_1 = ifelse(qa16_1 == 5, NA, qa16_1),
    qa16_2 = ifelse(qa16_2 == 5, NA, qa16_2),
    qa16_3 = ifelse(qa16_3 == 5, NA, qa16_3),
    qa16_4 = ifelse(qa16_4 == 5, NA, qa16_4),
    
    d71_1 = ifelse(d71_1 == 4, NA, d71_1),
    d71_2 = ifelse(d71_2 == 4, NA, d71_2),
    d71_3 = ifelse(d71_3 == 4, NA, d71_3),
    #qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
    #qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
    #qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
    #qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
    
    qb3_1 = ifelse(qb3_1 == 5, NA, qb3_1),
    qb3_2 = ifelse(qb3_2 == 5, NA, qb3_2),
    qb3_3 = ifelse(qb3_3 == 5, NA, qb3_3),
    qb3_4 = ifelse(qb3_4 == 5, NA, qb3_4),
    qb3_5 = ifelse(qb3_5 == 5, NA, qb3_5),
    qb3_6 = ifelse(qb3_6 == 5, NA, qb3_6),
    qb3_7 = ifelse(qb3_7 == 5, NA, qb3_7),
    
    qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
    qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
    qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
    qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
    qb4_5 = ifelse(qb4_5 == 5, NA, qb4_5),
    
    qb5_1 = ifelse(qb5_1 == 5, NA, qb5_1),
    qb5_2 = ifelse(qb5_2 == 5, NA, qb5_2),
    qb5_3 = ifelse(qb5_3 == 5, NA, qb5_3),
    qb5_4 = ifelse(qb5_4 == 5, NA, qb5_4),
    
    sd1_1 = ifelse(sd1_1 %in% c(3,4), NA, sd1_1),
    sd1_2 = ifelse(sd1_2 %in% c(3,4), NA, sd1_2),
    sd1_3 = ifelse(sd1_3 %in% c(3,4), NA, sd1_3),
    sd1_4 = ifelse(sd1_4 %in% c(3,4), NA, sd1_4),
    sd1_5 = ifelse(sd1_5 %in% c(3,4), NA, sd1_5),
    sd1_6 = ifelse(sd1_6 %in% c(3,4), NA, sd1_6),
    sd1_7 = ifelse(sd1_7 %in% c(3,4), NA, sd1_7),
    sd1_8 = ifelse(sd1_8 %in% c(3,4), NA, sd1_8),
    
    qc1_1 = ifelse(qc1_1 == 6, NA, qc1_1),
    qc1_2 = ifelse(qc1_2 == 6, NA, qc1_2),
    qc1_3 = ifelse(qc1_3 == 6, NA, qc1_3),
    qc1_4 = ifelse(qc1_4 == 6, NA, qc1_4),
    qc1_5 = ifelse(qc1_5 == 6, NA, qc1_5),
    qc1_6 = ifelse(qc1_6 == 6, NA, qc1_6),
    qc1_7 = ifelse(qc1_7 == 6, NA, qc1_7),
    qc1_8 = ifelse(qc1_8 == 6, NA, qc1_8),
    qc1_9 = ifelse(qc1_9 == 6, NA, qc1_9),
    qc1_10 = ifelse(qc1_10 == 6, NA, qc1_10),
    qc5_1 = ifelse(qc5_1 == 3, NA, qc5_1),
    qc5_2 = ifelse(qc5_2 == 3, NA, qc5_2),
    qc5_3 = ifelse(qc5_3 == 3, NA, qc5_3),
    qc5_4 = ifelse(qc5_4 == 3, NA, qc5_4),
    qc6_1 = ifelse(qc6_1 == 12, NA, qc6_1),
    qc6_2 = ifelse(qc6_2 == 12, NA, qc6_2),
    qc6_3 = ifelse(qc6_3 == 12, NA, qc6_3),
    qc6_4 = ifelse(qc6_4 == 12, NA, qc6_4),
    qc6_5 = ifelse(qc6_5 == 12, NA, qc6_5),
    qc6_6 = ifelse(qc6_6 == 12, NA, qc6_6),
    qc6_7 = ifelse(qc6_7 == 12, NA, qc6_7),
    qc6_8 = ifelse(qc6_8 == 12, NA, qc6_8),
    qc6_9 = ifelse(qc6_9 == 12, NA, qc6_9),
    qc6_10 = ifelse(qc6_10 == 12, NA, qc6_10),
    qc6_11 = ifelse(qc6_11 == 12, NA, qc6_11),
    qc9_1 = ifelse(qc9_1 == 7, NA, qc9_1),
    qc9_2 = ifelse(qc9_2 == 7, NA, qc9_2),
    qc9_3 = ifelse(qc9_3 == 7, NA, qc9_3),
    qc9_4 = ifelse(qc9_4 == 7, NA, qc9_4),
    qc9_5 = ifelse(qc9_5 == 7, NA, qc9_5),
    qc9_6 = ifelse(qc9_6 == 7, NA, qc9_6),
    qc9_7 = ifelse(qc9_7 == 7, NA, qc9_7),
    qc9_8 = ifelse(qc9_8 == 7, NA, qc9_8),
    qc9_9 = ifelse(qc9_9 == 7, NA, qc9_9),
    qc9_10 = ifelse(qc9_10 == 7, NA, qc9_10),
    qc9_11 = ifelse(qc9_11 == 7, NA, qc9_11),
    qc11_1 = ifelse(qc11_1 == 6, NA, qc11_1),
    qc11_2 = ifelse(qc11_2 == 6, NA, qc11_2),
    qc11_3 = ifelse(qc11_3 == 6, NA, qc11_3),
    qc11_4 = ifelse(qc11_4 == 6, NA, qc11_4),
    qc11_5 = ifelse(qc11_5 == 6, NA, qc11_5),
    qc11_6 = ifelse(qc11_6 == 6, NA, qc11_6),
    qc12_1 = ifelse(qc12_1 %in% c(11,12,13), NA, qc12_1),
    qc12_2 = ifelse(qc12_2 %in% c(11,12,13), NA, qc12_2),
    qc12_3 = ifelse(qc12_3 %in% c(11,12,13), NA, qc12_3),
    qc12_4 = ifelse(qc12_4 %in% c(11,12,13), NA, qc12_4),
    qc12_5 = ifelse(qc12_5 %in% c(11,12,13), NA, qc12_5),
    qc12_6 = ifelse(qc12_6 %in% c(11,12,13), NA, qc12_6),
    qc12_7 = ifelse(qc12_7 %in% c(11,12,13), NA, qc12_7),
    qc12_8 = ifelse(qc12_8 %in% c(11,12,13), NA, qc12_8),
    qc12_9 = ifelse(qc12_9 %in% c(11,12,13), NA, qc12_9),
    qc12_10 = ifelse(qc12_10 %in% c(11,12,13), NA, qc12_10),
    qc12_11 = ifelse(qc12_11 %in% c(11,12,13), NA, qc12_11),
    qc12_12 = ifelse(qc12_12 %in% c(11,12,13), NA, qc12_12),
    qc12_13 = ifelse(qc12_13 %in% c(11,12,13), NA, qc12_13),
    qc12_14 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_14),
    qc12_15 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_15),
    qc13_1 = ifelse(qc13_1 %in% c(11,12,13), NA, qc13_1),
    qc13_2 = ifelse(qc13_2 %in% c(11,12,13), NA, qc13_2),
    qc13_3 = ifelse(qc13_3 %in% c(11,12,13), NA, qc13_3),
    qc13_4 = ifelse(qc13_4 %in% c(11,12,13), NA, qc13_4),
    qc13_5 = ifelse(qc13_5 %in% c(11,12,13), NA, qc13_5),
    qc13_6 = ifelse(qc13_6 %in% c(11,12,13), NA, qc13_6),
    qc13_7 = ifelse(qc13_7 %in% c(11,12,13), NA, qc13_7),
    qc13_8 = ifelse(qc13_8 %in% c(11,12,13), NA, qc13_8),
    qc13_9 = ifelse(qc13_9 %in% c(11,12,13), NA, qc13_9),
    qc13_10 = ifelse(qc13_10 %in% c(11,12,13), NA, qc13_10),
    qc13_11 = ifelse(qc13_11 %in% c(11,12,13), NA, qc13_11),
    qc13_12 = ifelse(qc13_12 %in% c(11,12,13), NA, qc13_12),
    qc13_13 = ifelse(qc13_13 %in% c(11,12,13), NA, qc13_13),
    qc13_14 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_14),
    qc13_15 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_15),
    qc15_1 = ifelse(qc15_1 == 5, NA, qc15_1),
    qc15_2 = ifelse(qc15_2 == 5, NA, qc15_2),
    qc15_3 = ifelse(qc15_3 == 5, NA, qc15_3),
    qc16_1 = ifelse(qc16_1 == 5, NA, qc16_1),
    qc17_1 = ifelse(qc17_1 == 5, NA, qc17_1),
    qc17_2 = ifelse(qc17_2 == 5, NA, qc17_2),
    qc17_3 = ifelse(qc17_3 == 5, NA, qc17_3),
    qc17_4 = ifelse(qc17_4 == 5, NA, qc17_4),
    qc17_5 = ifelse(qc17_5 == 5, NA, qc17_5),
    qc17_6 = ifelse(qc17_6 == 5, NA, qc17_6),
    qc17_7 = ifelse(qc17_7 == 5, NA, qc17_7),
    qc18_1 = ifelse(qc18_1 %in% c(11,12), NA, qc18_1),
    qc18_2 = ifelse(qc18_2 %in% c(11,12), NA, qc18_2),
    qc18_3 = ifelse(qc18_3 %in% c(11,12), NA, qc18_3),
    
    sd3 = ifelse(sd3 %in% c(15,16), NA, sd3),
    
    qa1 = ifelse(qa1 == 5, NA, qa1),
    qa7 = ifelse(qa7 == 5, NA, qa7),
    qa8 = ifelse(qa8 == 4, NA, qa8),
    qa9 = ifelse(qa9 == 6, NA, qa9),
    qa14 = ifelse(qa14 == 6, NA, qa14),
    qa17 = ifelse(qa17 == 5, NA, qa17),
    
    qb6 = ifelse(qb6 == 4, NA, qb6),
    qb7 = ifelse(qb7 == 5, NA, qb7),
    
    qc19 = ifelse(qc19 == 3, NA, qc19),
    qc20 = ifelse(qc20 == 3, NA, qc20),
    
    d63 = ifelse(d63 %in% c(6,7,8,9), NA, d63), # manually because stupid Claude
    d77 = ifelse(d77 == 5, NA, d77),
    d70 = ifelse(d70 == 5, NA, d70),
    
    qa4a = ifelse(qa4a %in% c(7,8), NA, qa4a),
    qa5a = ifelse(qa5a %in% c(11,12,13), NA, qa5a),
    qa11 = ifelse(qa11 %in% c(4,5), NA, qa11),
    qa12 = ifelse(qa12 %in% c(5,6), NA, qa12),
    qa13 = ifelse(qa13 %in% c(6,7), NA, qa13),
    qa18a = ifelse(qa18a %in% c(7,8,9), NA, qa18a),
    
    qb8 = ifelse(qb8 == 5, NA, qb8),
    sd3 = ifelse(sd3 == 16, NA, sd3),
    qc3 = ifelse(qc3 %in% c(10,11), NA, qc3),
    qc7 = ifelse(qc7 %in% c(11,12), NA, qc7),
    qc8 = ifelse(qc8 %in% c(11,12), NA, qc8),
    qc10 = ifelse(qc10 %in% c(9,10), NA, qc10))

###
# get rid of all the "r" coded ones
# for the questions with underscores, (1) if they are on a scale, e.g., 1-3 and 'DK'
# is one of them, replace 'DK'; if it's 0-1 coding, keep that (can't do anything about it)

data_correctly_coded <- data_correctly_coded %>%
  select(-matches("_r$|_r[0-9]$|_r[0-9]_")) 
# use setdiff() to check whether it's done properly 


### 
# now analyse the patterns of NAs across columns
missing_data_before <- data_correctly_coded %>%
  summarise(across(everything(), ~sum(is.na(.))/n())) %>%
  pivot_longer(cols = everything(), 
               names_to = "variable", 
               values_to = "missing_proportion") %>%
  arrange(desc(missing_proportion))

# view top variables with missing data
print(head(missing_data_before, 20))
## # A tibble: 20 × 2
##    variable missing_proportion
##    <chr>                 <dbl>
##  1 p6mt                  0.982
##  2 p13mt                 0.982
##  3 p6cy                  0.982
##  4 p7cy                  0.982
##  5 p6lu                  0.981
##  6 p7lu                  0.981
##  7 p13lu                 0.981
##  8 p6hr                  0.964
##  9 p7hr                  0.964
## 10 p6ee                  0.963
## 11 p6fi                  0.963
## 12 p6lt                  0.963
## 13 p7ee                  0.963
## 14 p7fi                  0.963
## 15 p7lt                  0.963
## 16 p13ee                 0.963
## 17 p13fi                 0.963
## 18 p6dk                  0.963
## 19 p7dk                  0.963
## 20 p6es                  0.963
# viz
ggplot(missing_data_before %>% filter(missing_proportion > 0.25), 
       aes(x = reorder(variable, missing_proportion), y = missing_proportion)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variables with >25% Missing Data",
       x = "Variable",
       y = "Proportion Missing") +
  theme_minimal()

# calculate overall proportion of missing data
mean(missing_data_before$missing_proportion)
## [1] 0.1424237
# deeper analysis
missing_by_prefix <- missing_data_before %>%
  mutate(prefix = str_extract(variable, "^[a-z]+\\d+")) %>%
  group_by(prefix) %>%
  summarise(
    avg_missing = mean(missing_proportion),
    n_vars = n(),
    max_missing = max(missing_proportion),
    min_missing = min(missing_proportion)
  ) %>%
  arrange(desc(avg_missing))

print(head(missing_by_prefix, 15))
## # A tibble: 15 × 5
##    prefix avg_missing n_vars max_missing min_missing
##    <chr>        <dbl>  <int>       <dbl>       <dbl>
##  1 p13          0.945      8       0.982      0.779 
##  2 p6           0.931     29       0.982      0     
##  3 p7           0.930     28       0.982      0.0180
##  4 qc3          0.861      1       0.861      0.861 
##  5 qa3          0.679      7       0.679      0.679 
##  6 qa15         0.627      5       0.627      0.627 
##  7 qa2          0.376      8       0.376      0.376 
##  8 d15          0.261      2       0.522      0     
##  9 qb7          0.208      1       0.208      0.208 
## 10 d40          0.157      5       0.784      0     
## 11 qa13         0.124      1       0.124      0.124 
## 12 qc19         0.120      1       0.120      0.120 
## 13 qc16         0.117      1       0.117      0.117 
## 14 qc20         0.116      1       0.116      0.116 
## 15 qc10         0.109      1       0.109      0.109
# we take out the following variables because (1) they exhibited high missingness
# while at the same aren't super releveant (that's an assumption we make) for our
# subsequent analysis
data_correctly_coded <- data_correctly_coded %>%
  select(
    # Exclude variables with specific prefixes
    -starts_with("p6"),   # Size of locality
    -starts_with("p7"),   # Region
    -starts_with("p13"),  # Language of interview
    -starts_with("d40"),  # Household size
    -starts_with("qa3"),  # Not benefitting from trade
    -starts_with("qa15"), # Countries bought goods from
    -starts_with("qa2"),  # Benefitting from trade
    -starts_with("qb8"),  # EU energy label influence
    -starts_with("qa18b"), # Information sources follow-up
    -starts_with("qc3")   # Discrimination circumstances
  )

# check how much we imporved the missingness rate
missing_data_after <- data_correctly_coded %>%
  summarise(across(everything(), ~sum(is.na(.))/n())) %>%
  pivot_longer(cols = everything(), 
               names_to = "variable", 
               values_to = "missing_proportion") %>%
  arrange(desc(missing_proportion))

mean(missing_data_after$missing_proportion)
## [1] 0.02018962
# prepare for imputation
data_correctly_coded <-  data_correctly_coded %>%
  mutate(isocntry = as.factor(isocntry)) %>%
  # remove other problematic variables (like IDs) if needed
  select(!any_of(c("doi", "version", "caseid", "uniqid", "serialid"))) %>%
  sjlabelled::remove_all_labels()

# delete variables with zero variance as they cannot ever be useful predictors
var_zero <- data_correctly_coded %>% 
  select(where(is.numeric)) %>% 
  summarise(across(everything(), ~ var(.x, na.rm = TRUE))) %>% 
  pivot_longer(everything(), names_to = "variable", values_to = "variance") %>% 
  filter(variance == 0 | is.na(variance))

if(nrow(var_zero) > 0) {
  cat("Removing", nrow(var_zero), "variables with zero variance\n")
  data_correctly_coded <- data_correctly_coded %>%
    select(!any_of(var_zero$variable))
}
## Removing 6 variables with zero variance
data_correctly_coded <- data_correctly_coded %>%
  # convert country codes: collapse East and West Germany into one DE
  mutate(isocntry = case_when(
    isocntry %in% c("DE-W", "DE-E") ~ "DE",
    TRUE ~ isocntry))

###
# run the imputation
set.seed(1212)

start_time <- Sys.time()
#imp_model <- mice(data_correctly_coded, m = 3, method = 'rf', maxit = 3)
end_time <- Sys.time()
end_time - start_time
## Time difference of 0.0008299351 secs

Data cleaning: external datasets

We also needed to make decisions about whether or not to impute missing variables from a few of our external datasets. We chose to delete the variables from the ESS because the Eurobarometer survey already contains similar information and imputing these values didn’t seem appropriate. However, we did choose to impute the happiness score using KNN.

country_level_df <- readRDS("country_level_df.rds")

# first, let's check which variables have missing values
missing_summary <- colSums(is.na(country_level_df))
missing_summary <- missing_summary[missing_summary > 0]
print(missing_summary)
##         Happiness_Score           n_respondents                 n_valid 
##                       1                       4                       4 
##    lgbt_support_percent        mean_religiosity         mean_left_right 
##                       4                       4                       4 
##       mean_equal_values     mean_country_attach             mean_eduyrs 
##                       4                       4                       4 
##                mean_age               pct_young           pct_high_educ 
##                       4                       4                       4 
##         se_lgbt_support        pct_missing_lgbt           age_lgbt_corr 
##                       4                       4                       4 
##         relig_lgbt_corr lgbt_support_inequality           educ_gradient 
##                       4                       4                       4 
##   gender_equality_index          z_lgbt_support       norm_lgbt_support 
##                       1                       4                       4 
##             z_happiness       z_gender_equality    norm_gender_equality 
##                       1                       1                       1 
##           z_religiosity            z_left_right          z_equal_values 
##                       4                       4                       4 
##        z_country_attach                z_eduyrs                   z_age 
##                       4                       4                       4
# Visualize missing data patterns
vis_miss(country_level_df)

# (1) 
# delete ESS variables
ESS_variables <- c(
  # original survey variables
  "n_respondents", "n_valid", "lgbt_support_percent", "mean_religiosity", 
  "mean_left_right", "mean_equal_values", "mean_country_attach", 
  "mean_eduyrs", "mean_age", "pct_young", "pct_high_educ", "se_lgbt_support",
  
  # additional ESS metrics
  "pct_missing_lgbt", "age_lgbt_corr", "relig_lgbt_corr", 
  "lgbt_support_inequality", "educ_gradient",
  
  # z-score transformations of ESS variables
  "z_religiosity", "z_left_right", "z_equal_values", 
  "z_country_attach", "z_eduyrs", "z_age")

country_level_df <- country_level_df %>%
  select(-all_of(ESS_variables))

# (2)
# create separate datasets for variables that need imputation
missing_variables <- names(missing_summary)[missing_summary < 5 & 
                                             !(names(missing_summary) %in% ESS_variables)]

# for our KNN imputation, we need to identify variables to use as predictors
# these should be variables without NAs that also correlate with those we want to impute

# identify complete variables that could serve as predictors
complete_variables <- names(country_level_df)[colSums(is.na(country_level_df)) == 0]
complete_variables <- complete_variables[!complete_variables %in% c("Unnamed: 0", "country_name", "iso2", "region")]

print(complete_variables)
##  [1] "v2x_libdem"                      "v2x_polyarchy"                  
##  [3] "v2x_gender"                      "v2x_egaldem"                    
##  [5] "v2x_liberal"                     "v2xcs_ccsi"                     
##  [7] "v2x_freexp"                      "Overall_score"                  
##  [9] "Electoral_process_and_pluralism" "Functioning_of_government"      
## [11] "Political_participation"         "Political_culture"              
## [13] "Civil_liberties"                 "Regime_type"                    
## [15] "gdp_2005"                        "gdp_2018"                       
## [17] "gdp_growth"                      "rainbow_score_2019"             
## [19] "rainbow_score_2018"              "rainbow_score_avg_2019_2018"    
## [21] "rainbow_score_avg_2013_2014"     "rainbow_score_difference"       
## [23] "Unemployment"                    "Employment"                     
## [25] "z_v2x_libdem"                    "z_v2x_polyarchy"                
## [27] "z_v2x_gender"                    "z_v2x_egaldem"                  
## [29] "z_v2x_liberal"                   "z_v2xcs_ccsi"                   
## [31] "z_v2x_freexp"                    "z_gdp_2018"                     
## [33] "log_gdp_2018"                    "z_gdp_growth"                   
## [35] "z_unemployment"                  "z_rainbow_score"                
## [37] "norm_rainbow_score"              "z_functioning_of_government"    
## [39] "composite_equality"              "composite_democracy"
# custom function to impute variables with KNN, handling non-numeric data
knn_impute <- function(data, vars_to_impute, predictor_vars, k = 5) {
  # create a copy of the data
  imputed_data <- data
  
  # ensure predictor variables are numeric and complete
  pred_data <- data[, predictor_vars, drop = FALSE]
  pred_data <- as.data.frame(lapply(pred_data, function(x) as.numeric(x)))
  
  # remove any non-numeric columns
  numeric_cols <- sapply(pred_data, is.numeric)
  if(any(!numeric_cols)) {
    warning("Removing non-numeric predictor columns: ", 
            paste(names(pred_data)[!numeric_cols], collapse=", "))
    pred_data <- pred_data[, numeric_cols, drop=FALSE]
    predictor_vars <- predictor_vars[numeric_cols]
  }
  
  # handle missing values in predictor variables by using column means
  for(col in names(pred_data)) {
    if(any(is.na(pred_data[[col]]))) {
      pred_data[[col]][is.na(pred_data[[col]])] <- mean(pred_data[[col]], na.rm=TRUE)
    }
  }
  
  # standardize the predictor variables manually to avoid issues
  pred_data_std <- as.data.frame(scale(pred_data))
  
  # for each variable to impute:
  for (var in vars_to_impute) {
    if (var %in% names(data) && sum(is.na(data[[var]])) > 0) {
      # ensure the target variable is numeric
      if (!is.numeric(data[[var]])) {
        cat("Skipping", var, "as it is not numeric\n")
        next
      }
      
      # identify cases with missing values
      missing_indices <- which(is.na(data[[var]]))
      
      # for each missing value:
      for (idx in missing_indices) {
        # calculate Euclidean distances to all other countries
        distances <- numeric(nrow(data))
        
        for (i in 1:nrow(data)) {
          if (i != idx) {
            # calculate squared differences for each predictor
            squared_diffs <- sapply(names(pred_data_std), function(p) {
              (pred_data_std[idx, p] - pred_data_std[i, p])^2
            })
            
            # sum and take square root for Euclidean distance
            distances[i] <- sqrt(sum(squared_diffs, na.rm=TRUE))
          } else {
            distances[i] <- Inf  # don't use the country itself
          }
        }
        
        # find k nearest neighbors with non-missing values for this variable
        valid_neighbors <- which(!is.na(data[[var]]) & !is.infinite(distances))
        
        if (length(valid_neighbors) > 0) {
          # order the neighbors by distance
          neighbor_order <- order(distances[valid_neighbors])
          valid_neighbors <- valid_neighbors[neighbor_order]
          
          # take the k nearest, or as many as available if fewer than k
          k_actual <- min(k, length(valid_neighbors))
          nearest_k <- valid_neighbors[1:k_actual]
          
          # impute as the average of the k nearest neighbors
          imputed_data[idx, var] <- mean(data[nearest_k, var], na.rm = TRUE)
          
          cat("Imputed", var, "for", data$country_name[idx], 
              "using neighbors:", paste(data$country_name[nearest_k], collapse=", "), "\n")
        } else {
          cat("Warning: Could not impute", var, "for", data$country_name[idx], 
              "- no valid neighbors found\n")
        }
      }
    }
  }
  
  return(imputed_data)
}

# now use the function to impute variables with missing values
# first ensure we have the right variable types
country_df_numeric <- country_level_df

for (var in missing_variables) {
  if (var %in% names(country_level_df) && !is.numeric(country_level_df[[var]])) {
    country_df_numeric[[var]] <- as.numeric(as.character(country_level_df[[var]]))
  }
}

# run the imputation with proper error handling
tryCatch({
  country_df_imputed <- knn_impute(
    data = country_df_numeric,
    vars_to_impute = missing_variables,
    predictor_vars = complete_variables,
    k = 3  # using 3 nearest neighbors
  )
  print("Imputation completed successfully!")
}, error = function(e) {
  # if the knn_impute function still fails, let's try an alternative approach
  print(paste("KNN imputation error:", e$message))
  print("Switching to a simpler mean imputation approach...")
  
  country_df_imputed <- country_df_numeric
  for (var in missing_variables) {
    if (sum(is.na(country_df_imputed[[var]])) > 0) {
      # Calculate mean of non-missing values
      var_mean <- mean(country_df_imputed[[var]], na.rm = TRUE)
      # Replace missing values with mean
      country_df_imputed[[var]][is.na(country_df_imputed[[var]])] <- var_mean
      print(paste("Imputed", var, "with mean value:", var_mean))
    }
  }
  return(country_df_imputed)
})
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Imputed Happiness_Score for Belgium using neighbors: France, Portugal, Denmark 
## Imputed gender_equality_index for United Kingdom using neighbors: Netherlands, Austria, France 
## Imputed z_lgbt_support for Luxembourg using neighbors: Ireland, Austria, Germany 
## Imputed z_lgbt_support for Greece using neighbors: Spain, Slovenia, Italy 
## Imputed z_lgbt_support for Malta using neighbors: United Kingdom, Croatia, Austria 
## Imputed z_lgbt_support for Romania using neighbors: Poland, Bulgaria, Croatia 
## Imputed norm_lgbt_support for Luxembourg using neighbors: Ireland, Austria, Germany 
## Imputed norm_lgbt_support for Greece using neighbors: Spain, Slovenia, Italy 
## Imputed norm_lgbt_support for Malta using neighbors: United Kingdom, Croatia, Austria 
## Imputed norm_lgbt_support for Romania using neighbors: Poland, Bulgaria, Croatia 
## Imputed z_happiness for Belgium using neighbors: France, Portugal, Denmark 
## Imputed z_gender_equality for United Kingdom using neighbors: Netherlands, Austria, France 
## Imputed norm_gender_equality for United Kingdom using neighbors: Netherlands, Austria, France 
## [1] "Imputation completed successfully!"

Country aggregates

# load the data
df_rf_new <- read_csv("df_rf_new.csv") # the imputed dataset
## New names:
## Rows: 27438 Columns: 475
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): isocntry, nuts dbl (473): ...1, tnscntry, country, d11, d11r1, d11r2,
## d11r3, gen1, gen2, ge...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
country_df_imputed <- readRDS("country_df_imputed.rds") # external dataset

# custom function to calculate country-level aggregates
calculate_country_aggregates <- function(df) {
  country_agg <- df %>%
    group_by(isocntry) %>%
    summarize(
      
      iso2 = first(isocntry),
      
      # demographics
      mean_age = mean(d11, na.rm = TRUE),
      median_age = median(d11, na.rm = TRUE),
      
      # life satisfaction (recoded)
      mean_life_satisfaction = mean(5 - d70, na.rm = TRUE), 
      pct_satisfied = mean(d70 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # political discussions
      mean_natl_political_discuss = mean(d71_1, na.rm = TRUE),
      mean_eu_political_discuss = mean(d71_2, na.rm = TRUE),
      mean_local_political_discuss = mean(d71_3, na.rm = TRUE),
      pct_discuss_natl_politics = mean(d71_1 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_discuss_eu_politics = mean(d71_2 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_discuss_local_politics = mean(d71_3 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # trade and globalization (recoded)
      mean_trade_support = mean(5 - qa1, na.rm = TRUE),
      pct_trade_support = mean(qa1 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_pro_globalization = mean(qa5a %in% c(1, 2, 3, 5, 7, 8), na.rm = TRUE) * 100,
      pct_anti_globalization = mean(qa5a %in% c(4, 6, 9, 10), na.rm = TRUE) * 100,
      mean_eu_trade_support = mean(5 - qa7, na.rm = TRUE),
      pct_support_eu_trade = mean(qa7 %in% c(1, 2), na.rm = TRUE) * 100,
      mean_esg_support = mean(5 - qa9, na.rm = TRUE),
      pct_support_esg = mean(qa9 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # political and social indicators
      mean_left_right = mean(d1, na.rm = TRUE),
      pct_left = mean(d1 %in% c(1, 2, 3, 4), na.rm = TRUE) * 100,
      pct_center = mean(d1 %in% c(5, 6), na.rm = TRUE) * 100,
      pct_right = mean(d1 %in% c(7, 8, 9, 10), na.rm = TRUE) * 100,
      mean_education_years = mean(d8, na.rm = TRUE),
      median_education_years = median(d8, na.rm = TRUE),
      pct_high_education = mean(d8 >= 20, na.rm = TRUE) * 100,
      pct_rural = mean(d25 == 1, na.rm = TRUE) * 100,
      pct_urban = mean(d25 %in% c(2, 3), na.rm = TRUE) * 100,
      pct_large_urban = mean(d25 == 3, na.rm = TRUE) * 100,
      mean_financial_difficulty = mean(4 - d60, na.rm = TRUE),
      pct_financial_difficulty = mean(d60 %in% c(1, 2), na.rm = TRUE) * 100,
      mean_subjective_class = mean(d63, na.rm = TRUE),
      pct_working_class = mean(d63 == 1, na.rm = TRUE) * 100,
      pct_middle_class = mean(d63 %in% c(2, 3, 4), na.rm = TRUE) * 100,
      pct_upper_class = mean(d63 == 5, na.rm = TRUE) * 100,
      mean_voice_in_eu = mean(5 - d72_1, na.rm = TRUE),
      mean_voice_in_country = mean(5 - d72_2, na.rm = TRUE),
      pct_voice_in_eu = mean(d72_1 %in% c(1, 2), na.rm = TRUE) * 100,
      pct_voice_in_country = mean(d72_2 %in% c(1, 2), na.rm = TRUE) * 100,
      
      # diversity indicators
      pct_friends_diff_ethnic = mean(sd1_1 == 1, na.rm = TRUE) * 100,
      pct_friends_diff_skin = mean(sd1_2 == 1, na.rm = TRUE) * 100,
      pct_friends_roma = mean(sd1_3 == 1, na.rm = TRUE) * 100,
      pct_friends_lgbt = mean(sd1_4 == 1, na.rm = TRUE) * 100,
      pct_friends_disabled = mean(sd1_5 == 1, na.rm = TRUE) * 100,
      pct_friends_diff_religion = mean(sd1_6 == 1, na.rm = TRUE) * 100,
      pct_friends_transgender = mean(sd1_7 == 1, na.rm = TRUE) * 100,
      pct_friends_intersex = mean(sd1_8 == 1, na.rm = TRUE) * 100,
      pct_ethnic_minority = mean(sd2_1 == 1, na.rm = TRUE) * 100,
      pct_skin_minority = mean(sd2_2 == 1, na.rm = TRUE) * 100,
      pct_religious_minority = mean(sd2_3 == 1, na.rm = TRUE) * 100,
      pct_roma = mean(sd2_4 == 1, na.rm = TRUE) * 100,
      pct_sexual_minority = mean(sd2_5 == 1, na.rm = TRUE) * 100,
      pct_disability = mean(sd2_6 == 1, na.rm = TRUE) * 100,
      pct_other_minority = mean(sd2_7 == 1, na.rm = TRUE) * 100,
      pct_any_minority = mean(sd2_1 == 1 | sd2_2 == 1 | sd2_3 == 1 | sd2_4 == 1 | 
                                sd2_5 == 1 | sd2_6 == 1 | sd2_7 == 1, na.rm = TRUE) * 100,
      
      # religion
      pct_catholic = mean(sd3 == 1, na.rm = TRUE) * 100,
      pct_orthodox = mean(sd3 == 2, na.rm = TRUE) * 100,
      pct_protestant = mean(sd3 == 3, na.rm = TRUE) * 100,
      pct_other_christian = mean(sd3 == 4, na.rm = TRUE) * 100,
      pct_jewish = mean(sd3 == 5, na.rm = TRUE) * 100,
      pct_muslim = mean(sd3 %in% c(6, 7, 8), na.rm = TRUE) * 100,
      pct_atheist = mean(sd3 == 13, na.rm = TRUE) * 100,
      pct_nonbeliever = mean(sd3 == 14, na.rm = TRUE) * 100,
      pct_nonreligious = mean(sd3 %in% c(13, 14), na.rm = TRUE) * 100,
      
      # subjective discrimination perception (recoded)
      mean_discrim_ethnic = mean(5 - qc1_1, na.rm = TRUE),
      mean_discrim_skin = mean(5 - qc1_2, na.rm = TRUE),
      mean_discrim_roma = mean(5 - qc1_3, na.rm = TRUE),
      mean_discrim_lgbt = mean(5 - qc1_4, na.rm = TRUE),
      mean_discrim_age = mean(5 - qc1_5, na.rm = TRUE),
      mean_discrim_religion = mean(5 - qc1_6, na.rm = TRUE),
      mean_discrim_disability = mean(5 - qc1_7, na.rm = TRUE),
      mean_discrim_transgender = mean(5 - qc1_8, na.rm = TRUE),
      mean_discrim_gender = mean(5 - qc1_9, na.rm = TRUE),
      mean_discrim_intersex = mean(5 - qc1_10, na.rm = TRUE),
      
    )
  return(country_agg)
}

# apply the function
country_aggregates <- calculate_country_aggregates(df_rf_new)

# join 
country_data_combined <- country_df_imputed %>%
  left_join(country_aggregates, by = "iso2")

# while it's less relevant for a ML model such as random forest, for a multi-level 
# regression model, it's important to standardize the values to allow for better
# coefficient interpretation; the goal is to have both z-scores and 0-1 normalised
# values for each of the original variables (except for variables like 'region')

country_data_combined <- country_data_combined %>%
  # first handle all numeric variables that aren't already standardized
  mutate(
    # z-score standardization
    across(where(is.numeric) & 
             !starts_with("z_") & 
             !starts_with("norm_") &
             !matches("country|iso"), 
           list(z = ~scale(.)[,1]),
           .names = "z_{.col}"),
    
    # also create 0-1 normalized versions
    across(where(is.numeric) & 
             !starts_with("z_") & 
             !starts_with("norm_") &
             !matches("country|iso"), 
           list(norm = ~(.-min(., na.rm=TRUE))/(max(., na.rm=TRUE)-min(., na.rm=TRUE))),
           .names = "norm_{.col}"))

# save the combined dataset
saveRDS(country_data_combined, file = "country_data_combined.rds")
write_csv(country_data_combined, "country_data_combined.csv")

# join with df_rf; the actual, imputed survey data
df_rf_enriched_new <- df_rf_new %>%
  # ISO codes as key
  left_join(country_data_combined, by = c("isocntry" = "iso2"))

# save final dataset
write_csv(df_rf_enriched_new, "df_rf_enriched_new.csv")

Descriptive analysis 1

# load data
country_level_df <- readRDS("country_level_df.rds")

# plot 1: employment and unemployment Rates
p_1 <- ggplot(country_level_df, aes(x = reorder(country_name, Employment))) +
  geom_bar(aes(y = Employment), stat = "identity", fill = "steelblue", alpha = 0.7) +
  geom_point(aes(y = Unemployment), color = "darkred", size = 3) +
  scale_y_continuous(
    name = "Employment Rate (%)",
    limits = c(0, 100),
    sec.axis = sec_axis(~., name = "Unemployment Rate (%)")
  ) +
  labs(title = "Employment and Unemployment Rates by Country (2018)",
       subtitle = "Bars: Employment rate | Points: Unemployment rate",
       x = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.y.left = element_text(color = "steelblue"),
        axis.title.y.right = element_text(color = "darkred"))

# plot 2: GDP per capita and GDP per capita growth rate
p_2 <- ggplot(country_level_df, aes(x = reorder(country_name, gdp_2018))) +
  geom_bar(aes(y = gdp_2018), stat = "identity", fill = "steelblue", alpha = 0.7) +
  geom_point(aes(y = gdp_growth * 500), color = "darkred", size = 3) +  # Scale growth to fit on same axis
  scale_y_continuous(
    name = "GDP per Capita (USD)",
    labels = comma,
    sec.axis = sec_axis(~./500, name = "GDP Growth 2005-2018 (%)")
  ) +
  labs(title = "GDP per Capita (2018) and Growth Rate",
       subtitle = "Bars: GDP per capita | Points: Growth rate",
       x = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.y.left = element_text(color = "steelblue"),
        axis.title.y.right = element_text(color = "darkred"))

# plot 3: relationship between employment and LGBT support
p_3 <- ggplot(country_level_df, aes(x = Employment, y = lgbt_support_percent)) +
  geom_point(aes(color = Unemployment, size = rainbow_score_2019)) +
  geom_text(aes(label = iso2), hjust = -0.3, vjust = 0, size = 3) +
  scale_color_viridis_c(option = "B", name = "Unemployment\nRate (%)") +
  scale_size_continuous(name = "Rainbow\nScore 2019") +
  labs(title = "Relationship between Employment Rates and LGBT Support",
       subtitle = "Color indicates unemployment rate, size shows level of LGBT legal protection",
       x = "Employment Rate (%)",
       y = "LGBT Support (%)") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank())


# plot 4: relationship between GDP per capita to LGBT support
p_4 <- ggplot(country_level_df, aes(x = gdp_2018, y = lgbt_support_percent)) +
  geom_point(aes(color = rainbow_score_2019, size = gdp_growth)) +
  geom_text(aes(label = iso2), hjust = -0.3, vjust = 0, size = 3) +
  scale_color_viridis_c(option = "E", name = "Rainbow\nScore 2019") +
  scale_size_continuous(name = "GDP Growth\n2005-2018 (%)") +
  scale_x_continuous(labels = comma) +
  labs(title = "Relationship between GDP, LGBT Support and Rights Protections",
       subtitle = "Size indicates GDP growth rate from 2005-2018",
       x = "GDP per Capita (2018, USD)",
       y = "LGBT Support (%)") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank())

# display the plots
p_1

p_2

p_3
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).

p_4
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).

Reduced dataset

We will use this reduced dataset when building our explanatory model. We chose these variables based on previous literature on transgender rights and the results of our descriptive analysis.

#load data
df_reduced <- df_rf_enriched_new %>%
  select(
    # target
    qc19,
    
    # country identifier
    country, country_name, isocntry,
    
    # individual demographics
    age = d11,           
    gender = d10,            
    education = d8, d8r2,      
    occupation = d15a,           
    urban_rural = d25,            
    financial_insecurity = d60,            
    social_class = d63,         
    religion = sd3,            
    political_ideology = d1, 
    lgb_friends = sd1_4,
    trans_friends = sd1_7,
    
    # Personal identity
    ethnic_minority = sd2_1,
    skin_color_minority = sd2_2,
    religious_minority = sd2_3,
    sexual_lgbt_minority = sd2_5,
    disability_minority = sd2_6,
    other_minority = sd2_7,
    none_minority = sd2_8,
    
    # Discrimination
    trans_discrimination_country = qc1_8,
    trans_discrimination_personal = qc2_6,
    trans_discrimination_workplace = qc4_8,
    trans_discrimination_political = qc6_10,
    country_discrimination_efforts = qc7,
    country_discrimination_efforts_recoded = qc7r,
    trans_workplace_diversity = qc9_10,
    trans_colleague = qc12_11,
    trans_colleague_recoded = qc12_11r,
    trans_child_relationship = qc13_11,
    trans_child_relationship_recoded = qc13_11r,
    lgb_rights = qc15_1, 
    same_sex_relationship = qc15_2, 
    same_sex_marriage = qc15_3,
    lgb_school_materials = qc17_3,
    trans_school_materials = qc17_4,
    intersex_school_materials = qc17_5,
    two_men_public_affection = qc18_2,
    two_men_public_affection_recoded = qc18_2r,
    two_women_public_affection = qc18_3,
    two_women_public_affection_recoded = qc18_3r,
    non_gendered_docs = qc20,
    
    # Country-level variables
    gdp_2018,                # Economic development
    rainbow_score_2019,      # LGBTI rights/protections
    gender_equality_index,   # Gender equality
    Happiness_Score,         # National well-being
    gdp_2018,                # Economic development
    rainbow_score_2019,      # LGBTI rights/protections
    rainbow_score_2018,      # LGBTI rights/protections (previous year)
    rainbow_score_avg_2019_2018, # Average LGBTI rights/protections
    gender_equality_index,   # Gender equality
    Happiness_Score,         # National well-being
    v2x_libdem,              # Democratic quality
    v2x_egaldem,             # Democratic quality
    Regime_type,             # Democratic quality
    z_functioning_of_government, #Govenment function measure
    composite_equality,     # Equality measure
    z_composite_equality,   # Standardized equality measure
    norm_composite_equality, # Normalized equality measure
    z_lgbt_support,         # Standardized LGBT support
    norm_lgbt_support,      # Normalized LGBT support
    pct_friends_lgbt,       # Percentage of LGBT friends
    z_pct_friends_lgbt,     # Standardized % of LGBT friends
    norm_pct_friends_lgbt,  # Normalized % of LGBT friends
    mean_left_right,         # Average political leaning
    pct_high_education,      # Education level
    region,                  # Geographic region
    
    # Paradata
    interview_date = p1,                      
    interview_start_time = p2,                      
    interview_duration = p3,                      
    interview_duration_recoded = p3r,                     
    people_present_during_interview = p4,                      
    respondent_cooperation = p5                       
  )

# Save to a new file
write_rds(df_reduced, "df_reduced.rds")

Descriptive analysis 2

# load datasets
df_reduced <- readRDS("df_reduced.rds")
country_data_combined <- readRDS("country_data_combined.rds")

# first calculate country aggregates for individual variables so that we have
# continuous data for which we can more easily calculate correlations

df_reduced_aggregated <- df_reduced %>%
  group_by(country_name, isocntry) %>%
  summarize(
    
    # target
    pct_support_trans = mean(qc19 == 1, na.rm = TRUE) * 100,
    pct_oppose_trans = mean(qc19 == 2, na.rm = TRUE) * 100,
    pct_dk_trans = mean(qc19 == 3, na.rm = TRUE) * 100,
    
    # demographics
    mean_age = mean(age, na.rm = TRUE),
    pct_female = mean(gender == 2, na.rm = TRUE) * 100,
    mean_education_years = mean(education, na.rm = TRUE),
    pct_high_educ = mean(d8r2 >= 3, na.rm = TRUE) * 100,
    pct_rural = mean(urban_rural == 1, na.rm = TRUE) * 100,
    pct_urban = mean(urban_rural == 2, na.rm = TRUE) * 100,
    pct_large_urban = mean(urban_rural == 3, na.rm = TRUE) * 100,
    pct_financial_difficulty = mean(financial_insecurity <= 2, na.rm = TRUE) * 100,
    pct_working_class = mean(social_class == 1, na.rm = TRUE) * 100,
    pct_middle_class = mean(social_class %in% c(2,3,4), na.rm = TRUE) * 100,
    pct_upper_class = mean(social_class == 5, na.rm = TRUE) * 100,
    
    # values and identity
    mean_political_ideology = mean(political_ideology, na.rm = TRUE),
    pct_left = mean(political_ideology <= 3, na.rm = TRUE) * 100,
    pct_center = mean(political_ideology > 3 & political_ideology < 8, na.rm = TRUE) * 100,
    pct_right = mean(political_ideology >= 8, na.rm = TRUE) * 100,
    
    # religious composition
    pct_religious = mean(religion %in% c(1:11), na.rm = TRUE) * 100,
    pct_catholic = mean(religion == 1, na.rm = TRUE) * 100,
    pct_orthodox = mean(religion == 2, na.rm = TRUE) * 100,
    pct_protestant = mean(religion == 3, na.rm = TRUE) * 100,
    pct_other_christian = mean(religion == 4, na.rm = TRUE) * 100,
    pct_muslim = mean(religion %in% c(6:8), na.rm = TRUE) * 100,
    pct_other_religion = mean(religion %in% c(9:11), na.rm = TRUE) * 100,
    pct_atheist = mean(religion == 13, na.rm = TRUE) * 100,
    pct_nonbeliever = mean(religion == 14, na.rm = TRUE) * 100,
    pct_nonreligious = mean(religion %in% c(13,14), na.rm = TRUE) * 100,
    
    # social networks and contact
    pct_lgb_friends = mean(lgb_friends == 1, na.rm = TRUE) * 100,
    pct_trans_friends = mean(trans_friends == 1, na.rm = TRUE) * 100,
    
    # personal identity/minority status
    pct_ethnic_minority = mean(ethnic_minority == 1, na.rm = TRUE) * 100,
    pct_skin_color_minority = mean(skin_color_minority == 1, na.rm = TRUE) * 100,
    pct_religious_minority = mean(religious_minority == 1, na.rm = TRUE) * 100,
    pct_sexual_minority = mean(sexual_lgbt_minority == 1, na.rm = TRUE) * 100,
    pct_disability_minority = mean(disability_minority == 1, na.rm = TRUE) * 100,
    pct_other_minority = mean(other_minority == 1, na.rm = TRUE) * 100,
    pct_no_minority = mean(none_minority == 1, na.rm = TRUE) * 100,
    
    # discrimination perceptions; we need to be careful with interpretation/ coding
    mean_trans_discrim_country = mean(trans_discrimination_country, na.rm = TRUE),
    pct_perceive_trans_discrim = mean(trans_discrimination_country %in% c(1,2), na.rm = TRUE) * 100,
    pct_experienced_trans_discrim = mean(trans_discrimination_personal == 1, na.rm = TRUE) * 100,
    pct_workplace_trans_discrim = mean(trans_discrimination_workplace == 1, na.rm = TRUE) * 100,
    
    # political representation comfort - original scale (1-10, higher = more comfortable)
    mean_trans_political_comfort = mean(trans_discrimination_political, na.rm = TRUE),
    pct_comfortable_trans_political = mean(trans_discrimination_political >= 7, na.rm = TRUE) * 100,
    
    # effectiveness of discrimination efforts (original scale: 1-10, higher = more effective)
    mean_country_discrimination_efforts = mean(country_discrimination_efforts, na.rm = TRUE),
    
    # workplace diversity (1=Yes definitely, 4=No definitely not)
    mean_trans_workplace_diversity = mean(trans_workplace_diversity, na.rm = TRUE),
    pct_support_trans_workplace_diversity = mean(trans_workplace_diversity %in% c(1,2), na.rm = TRUE) * 100,
    
    # LGBT attitudes - REVERSED for these scales where 1=Totally agree, 4=Totally disagree
    # Create reversed versions so higher = more supportive
    mean_lgb_rights_rev = mean(case_when(
      lgb_rights == 1 ~ 4,
      lgb_rights == 2 ~ 3,
      lgb_rights == 3 ~ 2,
      lgb_rights == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_same_sex_relationship_rev = mean(case_when(
      same_sex_relationship == 1 ~ 4,
      same_sex_relationship == 2 ~ 3,
      same_sex_relationship == 3 ~ 2,
      same_sex_relationship == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_same_sex_marriage_rev = mean(case_when(
      same_sex_marriage == 1 ~ 4,
      same_sex_marriage == 2 ~ 3,
      same_sex_marriage == 3 ~ 2,
      same_sex_marriage == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    # also calculate percentage who agree with LGBT rights (original values 1 or 2)
    pct_support_lgb_rights = mean(lgb_rights %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_same_sex_relationship = mean(same_sex_relationship %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_same_sex_marriage = mean(same_sex_marriage %in% c(1,2), na.rm = TRUE) * 100,
    
    # school materials (1=Totally agree, 4=Totally disagree)
    # Create reversed versions so higher = more supportive
    mean_lgb_school_materials_rev = mean(case_when(
      lgb_school_materials == 1 ~ 4,
      lgb_school_materials == 2 ~ 3,
      lgb_school_materials == 3 ~ 2,
      lgb_school_materials == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_trans_school_materials_rev = mean(case_when(
      trans_school_materials == 1 ~ 4,
      trans_school_materials == 2 ~ 3,
      trans_school_materials == 3 ~ 2,
      trans_school_materials == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    mean_intersex_school_materials_rev = mean(case_when(
      intersex_school_materials == 1 ~ 4,
      intersex_school_materials == 2 ~ 3,
      intersex_school_materials == 3 ~ 2,
      intersex_school_materials == 4 ~ 1,
      TRUE ~ NA_real_), na.rm = TRUE),
    
    # also calculate percentage who agree with inclusive school materials
    pct_support_lgb_school_materials = mean(lgb_school_materials %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_trans_school_materials = mean(trans_school_materials %in% c(1,2), na.rm = TRUE) * 100,
    pct_support_intersex_school_materials = mean(intersex_school_materials %in% c(1,2), na.rm = TRUE) * 100,
    
    # comfort with same-sex public displays of affection (1-10)
    mean_men_pda_comfort = mean(two_men_public_affection, na.rm = TRUE),
    mean_women_pda_comfort = mean(two_women_public_affection, na.rm = TRUE),
    pct_comfortable_men_pda = mean(two_men_public_affection >= 7, na.rm = TRUE) * 100,
    pct_comfortable_women_pda = mean(two_women_public_affection >= 7, na.rm = TRUE) * 100,
    
    # comfort with transgender colleagues/children (1-10)
    mean_trans_colleague_comfort = mean(trans_colleague, na.rm = TRUE),
    mean_trans_child_relationship = mean(trans_child_relationship, na.rm = TRUE),
    pct_comfortable_trans_colleague = mean(trans_colleague >= 7, na.rm = TRUE) * 100,
    pct_comfortable_trans_child = mean(trans_child_relationship >= 7, na.rm = TRUE) * 100,
    
    # support for non-gendered documents
    pct_support_nongendered_docs = mean(non_gendered_docs == 1, na.rm = TRUE) * 100,
    pct_oppose_nongendered_docs = mean(non_gendered_docs == 2, na.rm = TRUE) * 100,
    pct_dk_nongendered_docs = mean(non_gendered_docs == 3, na.rm = TRUE) * 100) %>%
  ungroup()
## `summarise()` has grouped output by 'country_name'. You can override using the
## `.groups` argument.
# add country-level variables from the original dataset
# extract country-level variables that don't need to be aggregated
country_level_variables <- df_reduced %>%
  select(country_name, gdp_2018, rainbow_score_2019, rainbow_score_2018, 
         gender_equality_index, Happiness_Score, 
         v2x_libdem, v2x_egaldem, Regime_type, composite_equality, 
         z_composite_equality, norm_composite_equality, z_lgbt_support, 
         norm_lgbt_support, pct_friends_lgbt, z_pct_friends_lgbt, 
         norm_pct_friends_lgbt, mean_left_right, pct_high_education, region) %>%
  distinct()

# merge with aggregated data
df_reduced_aggregated_complete <- df_reduced_aggregated %>%
  left_join(country_level_variables, by = "country_name")

country_data_combined_add <- country_data_combined %>%
  select(iso2, norm_gdp_growth, norm_Unemployment, pct_trade_support, norm_gdp_2018,
         z_gender_equality, z_rainbow_score, z_v2x_libdem, z_v2x_egaldem, z_happiness,
         norm_gdp_growth, z_v2x_gender)

df_reduced_aggregated_complete <- df_reduced_aggregated_complete %>%
  left_join(country_data_combined_add, by = c("isocntry" = "iso2"))

# Check the final dataset
dim(df_reduced_aggregated_complete)
## [1]  28 100
# save it
write_rds(df_reduced_aggregated_complete, "df_reduced_aggregated_complete")


### create different plots
# calculate correlation matrix for numeric variables
cor_matrix <- cor(select_if(df_reduced_aggregated_complete, is.numeric), 
                  use = "pairwise.complete.obs")
## Warning in cor(select_if(df_reduced_aggregated_complete, is.numeric), use =
## "pairwise.complete.obs"): the standard deviation is zero
# (1) visualize complete correlation matrix
corrplot(cor_matrix, method = "color", type = "upper", 
         diag = F,
         tl.col = "black", tl.srt = 45, 
         number.cex = 0.7, tl.cex = 0.7)

# (2) create custom function to analyze correlations with outcome
analyze_correlations_with_outcome <- function(data, outcome_var, min_correlation = 0.3) {
  # numeric columns only
  numeric_data <- data %>% select(where(is.numeric))
  
  # calculate correlations with outcome
  cors <- cor(numeric_data, numeric_data[[outcome_var]], use = "pairwise.complete.obs")
  
  # convert to data frame
  cor_df <- data.frame(
    variable = rownames(cors),
    correlation = cors[,1]) %>%
    # remove the outcome variable correlating with itself
    filter(variable != outcome_var) %>%
    # sort by absolute correlation
    arrange(desc(abs(correlation)))
  
  # filter by minimum correlation threshold
  cor_df <- cor_df %>% filter(abs(correlation) >= min_correlation)
  
  return(cor_df)
}

# run correlation analysis for transgender support
trans_support_correlations <- analyze_correlations_with_outcome(
  df_reduced_aggregated_complete, "pct_support_trans", min_correlation = 0.25)
## Warning in cor(numeric_data, numeric_data[[outcome_var]], use =
## "pairwise.complete.obs"): the standard deviation is zero
# display results
print(trans_support_correlations)
##                                                                    variable
## pct_oppose_trans                                           pct_oppose_trans
## pct_support_nongendered_docs                   pct_support_nongendered_docs
## pct_oppose_nongendered_docs                     pct_oppose_nongendered_docs
## pct_support_lgb_school_materials           pct_support_lgb_school_materials
## z_lgbt_support                                               z_lgbt_support
## norm_lgbt_support                                         norm_lgbt_support
## pct_support_lgb_rights                               pct_support_lgb_rights
## mean_lgb_rights_rev                                     mean_lgb_rights_rev
## pct_support_intersex_school_materials pct_support_intersex_school_materials
## pct_support_trans_school_materials       pct_support_trans_school_materials
## pct_support_same_sex_marriage                 pct_support_same_sex_marriage
## pct_support_same_sex_relationship         pct_support_same_sex_relationship
## pct_comfortable_trans_colleague             pct_comfortable_trans_colleague
## mean_trans_colleague_comfort                   mean_trans_colleague_comfort
## mean_lgb_school_materials_rev                 mean_lgb_school_materials_rev
## mean_same_sex_marriage_rev                       mean_same_sex_marriage_rev
## mean_same_sex_relationship_rev               mean_same_sex_relationship_rev
## mean_intersex_school_materials_rev       mean_intersex_school_materials_rev
## mean_trans_school_materials_rev             mean_trans_school_materials_rev
## mean_men_pda_comfort                                   mean_men_pda_comfort
## norm_pct_friends_lgbt                                 norm_pct_friends_lgbt
## pct_lgb_friends                                             pct_lgb_friends
## pct_friends_lgbt                                           pct_friends_lgbt
## z_pct_friends_lgbt                                       z_pct_friends_lgbt
## pct_comfortable_men_pda                             pct_comfortable_men_pda
## mean_women_pda_comfort                               mean_women_pda_comfort
## pct_comfortable_trans_political             pct_comfortable_trans_political
## pct_comfortable_women_pda                         pct_comfortable_women_pda
## mean_trans_political_comfort                   mean_trans_political_comfort
## mean_trans_child_relationship                 mean_trans_child_relationship
## pct_comfortable_trans_child                     pct_comfortable_trans_child
## z_v2x_egaldem                                                 z_v2x_egaldem
## v2x_egaldem                                                     v2x_egaldem
## composite_equality                                       composite_equality
## z_composite_equality                                   z_composite_equality
## norm_composite_equality                             norm_composite_equality
## rainbow_score_2018                                       rainbow_score_2018
## z_v2x_libdem                                                   z_v2x_libdem
## v2x_libdem                                                       v2x_libdem
## rainbow_score_2019                                       rainbow_score_2019
## z_rainbow_score                                             z_rainbow_score
## z_gender_equality                                         z_gender_equality
## gender_equality_index                                 gender_equality_index
## pct_right                                                         pct_right
## pct_trans_friends                                         pct_trans_friends
## mean_political_ideology                             mean_political_ideology
## mean_left_right                                             mean_left_right
## Happiness_Score                                             Happiness_Score
## z_happiness                                                     z_happiness
## norm_gdp_2018                                                 norm_gdp_2018
## gdp_2018                                                           gdp_2018
## pct_center                                                       pct_center
## norm_gdp_growth                                             norm_gdp_growth
## mean_trans_discrim_country                       mean_trans_discrim_country
## pct_perceive_trans_discrim                       pct_perceive_trans_discrim
## pct_high_education                                       pct_high_education
## pct_high_educ                                                 pct_high_educ
## pct_workplace_trans_discrim                     pct_workplace_trans_discrim
## pct_financial_difficulty                           pct_financial_difficulty
## pct_female                                                       pct_female
## pct_left                                                           pct_left
## mean_age                                                           mean_age
## pct_atheist                                                     pct_atheist
## pct_trade_support                                         pct_trade_support
## z_v2x_gender                                                   z_v2x_gender
## pct_orthodox                                                   pct_orthodox
## pct_religious                                                 pct_religious
## pct_nonreligious                                           pct_nonreligious
## pct_experienced_trans_discrim                 pct_experienced_trans_discrim
## pct_urban                                                         pct_urban
## pct_support_trans_workplace_diversity pct_support_trans_workplace_diversity
## pct_no_minority                                             pct_no_minority
## pct_large_urban                                             pct_large_urban
## pct_protestant                                               pct_protestant
## pct_other_religion                                       pct_other_religion
## pct_ethnic_minority                                     pct_ethnic_minority
## mean_country_discrimination_efforts     mean_country_discrimination_efforts
##                                       correlation
## pct_oppose_trans                       -1.0000000
## pct_support_nongendered_docs            0.9552632
## pct_oppose_nongendered_docs            -0.9552632
## pct_support_lgb_school_materials        0.9099915
## z_lgbt_support                          0.9055394
## norm_lgbt_support                       0.9055394
## pct_support_lgb_rights                  0.9054333
## mean_lgb_rights_rev                     0.8873981
## pct_support_intersex_school_materials   0.8853638
## pct_support_trans_school_materials      0.8828781
## pct_support_same_sex_marriage           0.8820719
## pct_support_same_sex_relationship       0.8799193
## pct_comfortable_trans_colleague         0.8795309
## mean_trans_colleague_comfort            0.8697944
## mean_lgb_school_materials_rev           0.8679107
## mean_same_sex_marriage_rev              0.8669420
## mean_same_sex_relationship_rev          0.8656658
## mean_intersex_school_materials_rev      0.8479568
## mean_trans_school_materials_rev         0.8459165
## mean_men_pda_comfort                    0.8398369
## norm_pct_friends_lgbt                   0.8350491
## pct_lgb_friends                         0.8350491
## pct_friends_lgbt                        0.8350491
## z_pct_friends_lgbt                      0.8350491
## pct_comfortable_men_pda                 0.8313883
## mean_women_pda_comfort                  0.8219693
## pct_comfortable_trans_political         0.8144761
## pct_comfortable_women_pda               0.8135901
## mean_trans_political_comfort            0.8080428
## mean_trans_child_relationship           0.8008226
## pct_comfortable_trans_child             0.7938314
## z_v2x_egaldem                           0.7909474
## v2x_egaldem                             0.7909474
## composite_equality                      0.7752634
## z_composite_equality                    0.7752634
## norm_composite_equality                 0.7752634
## rainbow_score_2018                      0.7539898
## z_v2x_libdem                            0.7147897
## v2x_libdem                              0.7147897
## rainbow_score_2019                      0.7094428
## z_rainbow_score                         0.7094428
## z_gender_equality                       0.7035475
## gender_equality_index                   0.7035475
## pct_right                              -0.6890892
## pct_trans_friends                       0.6648839
## mean_political_ideology                -0.6422553
## mean_left_right                        -0.6422553
## Happiness_Score                         0.6310032
## z_happiness                             0.6310032
## norm_gdp_2018                           0.5771817
## gdp_2018                                0.5771817
## pct_center                              0.5523491
## norm_gdp_growth                        -0.5522688
## mean_trans_discrim_country             -0.5172765
## pct_perceive_trans_discrim              0.5008079
## pct_high_education                      0.4980007
## pct_high_educ                           0.4948097
## pct_workplace_trans_discrim             0.4664769
## pct_financial_difficulty               -0.4504279
## pct_female                             -0.4496147
## pct_left                                0.4253939
## mean_age                                0.4017389
## pct_atheist                             0.4001147
## pct_trade_support                       0.3991507
## z_v2x_gender                            0.3977959
## pct_orthodox                           -0.3976753
## pct_religious                          -0.3825894
## pct_nonreligious                        0.3724599
## pct_experienced_trans_discrim          -0.3707182
## pct_urban                               0.3530711
## pct_support_trans_workplace_diversity   0.3494914
## pct_no_minority                         0.3146637
## pct_large_urban                        -0.3090152
## pct_protestant                          0.3082064
## pct_other_religion                      0.2785021
## pct_ethnic_minority                    -0.2752098
## mean_country_discrimination_efforts     0.2604523
# create a visualization of top correlations
ggplot(trans_support_correlations %>% head(15), 
       aes(x = reorder(variable, correlation), y = correlation)) +
  geom_bar(stat = "identity", aes(fill = correlation > 0)) +
  coord_flip() +
  labs(title = "Top correlated variables of transgender rights support",
       subtitle = "Country-level aggregated data",
       x = "Variables",
       y = "Correlation with % supporting transgender document changes") +
  theme_minimal() +
  scale_fill_manual(values = c("firebrick", "steelblue"), 
                    name = "Direction",
                    labels = c("Negative", "Positive"))

# (3) create correlation matrix for key variables
key_vars <- c("pct_support_trans", 
              trans_support_correlations$variable[1:25], # top 25 correlates
              "gdp_2018", "rainbow_score_2019", "gender_equality_index")

key_cor_matrix <- cor(df_reduced_aggregated_complete[, key_vars], 
                      use = "pairwise.complete.obs")

corrplot(key_cor_matrix, method = "color",
         order = "hclust", diag = F,
         tl.col = "black", tl.srt = 45, 
         number.cex = 0.6, tl.cex = 0.7)

# (4) create correlation matrix for all country variables
key_vars_country <- c("pct_support_trans", "norm_gdp_2018", "rainbow_score_2019","z_gender_equality", "norm_Unemployment", "norm_gdp_growth", "z_v2x_libdem", "z_v2x_egaldem","z_happiness", "z_v2x_gender")

key_cor_matrix_country <- cor(df_reduced_aggregated_complete[, key_vars_country], 
                      use = "pairwise.complete.obs")

corrplot(key_cor_matrix_country, method = "color",
         order = "hclust", diag = F, addCoef.col = "black",
         tl.col = "black", tl.srt = 45, 
         number.cex = 0.6, tl.cex = 0.7)

### 
# regional comparisons
region_summary <- df_reduced_aggregated_complete %>%
  group_by(region) %>%
  summarize(
    n_countries = n(),
    avg_trans_support = mean(pct_support_trans),
    min_support = min(pct_support_trans),
    max_support = max(pct_support_trans),
    std_dev = sd(pct_support_trans)) %>%
  arrange(desc(avg_trans_support))

# visualize regional differences
ggplot(df_reduced_aggregated_complete, aes(x = reorder(region, pct_support_trans, FUN = mean), 
                                           y = pct_support_trans)) +
  geom_boxplot(aes(fill = region)) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  coord_flip() +
  labs(title = "Support for Transgender Rights by European Region",
       x = "Region",
       y = "% Supporting Transgender Rights") +
  theme_minimal() +
  theme(legend.position = "none")

### some regression
# create a scatterplot matrix with regression lines
ggplot(df_reduced_aggregated_complete, 
       aes(x = pct_nonreligious, y = pct_support_trans)) +
  geom_point(aes(size = rainbow_score_2019, color = region)) +
  geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 10) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Relationship Between Non-religiosity and Transgender Rights Support",
       x = "% Non-religious Population",
       y = "% Supporting Transgender Rights",
       size = "Rainbow Score",
       color = "Region") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

ggplot(df_reduced_aggregated_complete, 
       aes(x = gender_equality_index, y = pct_support_trans)) +
  geom_point(aes(size = rainbow_score_2019, color = region)) +
  geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 10) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Gender Equality and Transgender Rights Support",
       x = "Gender Equality Index",
       y = "% Supporting Transgender Rights",
       size = "Rainbow Score",
       color = "Region") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

### clusters
cluster_vars <- c("pct_support_trans", "norm_gdp_2018", 
                  "rainbow_score_2019", "gender_equality_index", 
                  "v2x_libdem", "pct_catholic", "pct_nonreligious")

# Prepare data
cluster_data <- df_reduced_aggregated_complete %>%
  select(country_name, all_of(cluster_vars)) %>%
  na.omit() %>%
  column_to_rownames("country_name")

# Scale the data
cluster_data_scaled <- scale(cluster_data)

# Compute distance matrix
dist_matrix <- dist(cluster_data_scaled, method = "euclidean")

# Hierarchical clustering
hc <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrogram
plot(hc, main = "Clustering of Countries by Support for Transgender Rights",
     sub = "", xlab = "", cex = 0.7)
rect.hclust(hc, k = 4, border = "red")

### summary table
country_summary <- df_reduced_aggregated_complete %>%
  select(country_name, pct_support_trans, rainbow_score_2019, 
         gender_equality_index, norm_gdp_2018,
         pct_nonreligious, v2x_libdem, gdp_2018) %>%
  arrange(desc(pct_support_trans))

kable(country_summary, 
      col.names = c("Country", "Trans Rights Support (%)", "Rainbow Score", 
                    "Gender Equality", "LGBT Comfort", 
                    "Non-religious (%)", "Liberal Democracy", "GDP 2018"),
      digits = 1) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(0, bold = TRUE) %>%
  add_header_above(c(" " = 1, "Support Measures" = 2, "Equality Measures" = 2, 
                     "Social Factors" = 2, "Economic" = 1))
Support Measures
Equality Measures
Social Factors
Economic
Country Trans Rights Support (%) Rainbow Score Gender Equality LGBT Comfort Non-religious (%) Liberal Democracy GDP 2018
Spain 89.1 58.5 68.3 0.2 21.7 0.8 41073.7
Netherlands 86.8 49.7 72.9 0.4 47.6 0.8 58818.0
Malta 82.2 74.7 60.1 0.3 4.8 0.6 48127.7
Denmark 79.5 60.3 76.8 0.4 14.2 0.9 57231.3
France 79.0 62.7 72.6 0.2 22.0 0.8 46397.5
Luxembourg 79.0 41.3 69.0 1.0 20.8 0.8 116334.0
Germany 78.9 47.5 65.5 0.3 28.1 0.8 56272.9
Portugal 75.4 57.5 56.0 0.1 9.7 0.8 34725.2
Ireland 74.1 44.7 69.5 0.7 10.0 0.8 86433.7
Sweden 73.1 52.9 82.6 0.3 37.2 0.9 53122.5
Belgium 73.0 70.4 70.5 0.3 24.9 0.8 52466.5
Finland 71.9 62.2 73.0 0.3 17.0 0.8 49242.7
United Kingdom 70.8 67.6 69.6 0.2 29.4 0.8 47108.0
Austria 61.8 48.5 63.3 0.4 21.5 0.8 56654.5
Greece 59.9 44.8 50.0 0.1 1.8 0.8 29792.0
Estonia 59.1 34.4 56.7 0.1 36.4 0.8 37201.1
Cyprus 53.5 22.0 55.1 0.2 2.0 0.7 40922.7
Slovenia 52.8 39.8 68.4 0.2 6.8 0.7 38656.2
Italy 49.9 19.4 62.1 0.2 12.2 0.8 43583.4
Poland 47.2 18.1 56.8 0.1 4.6 0.5 32360.9
Latvia 46.5 16.8 57.9 0.1 20.1 0.7 29832.1
Croatia 44.8 45.8 53.1 0.1 7.3 0.6 29043.9
Czech Republic 43.8 25.5 53.6 0.2 48.2 0.7 42016.4
Lithuania 41.7 19.9 56.8 0.1 7.0 0.8 36491.9
Slovakia 28.2 29.0 52.4 0.1 8.5 0.7 31514.2
Romania 23.6 20.7 52.4 0.1 1.6 0.6 29569.6
Bulgaria 16.7 25.8 58.0 0.0 4.3 0.5 24052.2
Hungary 16.2 42.8 50.8 0.1 17.4 0.4 32258.3
### check some bivariate relationships; choose three key demographic variblaes
# (1) age
ggplot(df_reduced, aes(x = age, y = as.numeric(qc19 == 1))) +
  geom_jitter(alpha = 0.1, height = 0.05) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Relationship Between Age and Transgender Rights Support",
       x = "Age", y = "Probability of Support") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# looks like age has a non-linear effect as the curve is flat for ages until ~60 and then falls off
# this suggests we should use a quadratic term for age

# (2) political ideology
ggplot(df_reduced, aes(x = political_ideology, y = as.numeric(qc19 == 1))) +
  geom_jitter(alpha = 0.1, height = 0.05) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Relationship Between Political Ideology and Transgender Rights Support",
       x = "Political Ideology (Left to Right)", 
       y = "Probability of Support") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# (3a) religion
ggplot(df_reduced_aggregated_complete, aes(x = pct_nonreligious, y = pct_support_trans)) +
  geom_point(aes(size = rainbow_score_2019, color = region), alpha = 0.8) +
  geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 15) +
  geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
  labs(title = "Relationship Between Non-religiosity and Transgender Rights Support",
       subtitle = "Country-level data from Special Eurobarometer 493 (2019)",
       x = "% Non-religious Population",
       y = "% Supporting Transgender Document Changes",
       size = "Rainbow Score",
       color = "Region") +
  theme_minimal() 
## `geom_smooth()` using formula = 'y ~ x'

# (3b) religion disaggregated
religion_support <- df_reduced %>%
  mutate(religion_group = case_when(
    religion == 1 ~ "Catholic",
    religion == 2 ~ "Orthodox",
    religion == 3 ~ "Protestant",
    religion == 4 ~ "Other Christian",
    religion %in% c(6:8) ~ "Muslim",
    religion %in% c(9:11) ~ "Other Religion",
    religion == 13 ~ "Atheist",
    religion == 14 ~ "Non-believer",
    TRUE ~ "Other/Missing")) %>%
  group_by(religion_group) %>%
  summarize(
    support_rate = mean(qc19 == 1, na.rm = TRUE) * 100,
    sample_size = n(),
    se = sqrt((support_rate/100 * (1-support_rate/100)) / sample_size) * 100) %>%
  filter(religion_group != "Other/Missing", sample_size >= 30) %>%
  arrange(desc(support_rate))

ggplot(religion_support, aes(x = reorder(religion_group, support_rate), y = support_rate)) +
  geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
  geom_errorbar(aes(ymin = support_rate - 1.96*se, 
                    ymax = support_rate + 1.96*se),
                width = 0.2) +
  geom_text(aes(label = sprintf("%.1f%%", support_rate), y = support_rate + 3),
            vjust = 0) +
  geom_text(aes(label = paste0("n=", sample_size), y = -5),
            vjust = 1, size = 3) +
  coord_flip() +
  labs(title = "Support for Transgender Rights by Religious Affiliation",
       subtitle = "Percentage supporting transgender document changes by religion",
       x = NULL,
       y = "Support Rate (%)") +
  ylim(-5, max(religion_support$support_rate) + 10) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.y = element_text(face = "bold"))

Based on the results of our descriptive analysis, we should use the following variables in the multilevel regression model: z_rainbow_score, z_gender_equality, regions, z_v2x_libdem, z_v2z_egaldem, norm_gdp_2018

### use descriptive ML for predictor selection
# prepare data for random forest by selecting variables we considered before
df_reduced$region <- as.factor(df_reduced$region)
df_reduced$Regime_type <- as.factor(df_reduced$Regime_type)
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 1, 1, ifelse(df_reduced$qc19 == 2, 0, NA)) %>%
  as.factor() # convert target to binary

# remove NAs due to the coding before
df_reduced <- df_reduced %>%
  na.omit()

# join
df_reduced_rf <- df_reduced %>%
  left_join(country_data_combined, by = "country_name") 

model_data <- df_reduced_rf %>%
  select(region.x, z_v2x_libdem, z_v2x_egaldem, z_v2x_freexp, z_v2x_gender, z_v2x_liberal,
         z_gender_equality_index, z_rainbow_score_2019, norm_gdp_2018, norm_Unemployment,
         norm_Happiness_Score, norm_gdp_growth, z_Functioning_of_government,
         z_Electoral_process_and_pluralism, qc19)

# split data into training and testing
set.seed(69420)
train_index <- createDataPartition(model_data$qc19, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]

# train Random Forest model
rf_model <- ranger(
  formula = qc19 ~ ., 
  data = train_data,
  importance = "impurity", 
  num.trees = 500,
  mtry = floor(sqrt(ncol(train_data) - 1)), 
  probability = TRUE
)

# extract variable importance correctly from ranger
var_importance <- rf_model$variable.importance

# ensure it is a numeric named vector before converting to a data frame
var_importance_df <- data.frame(
  Variable = names(var_importance),
  Importance = as.numeric(var_importance) 
)

# sort the data frame by importance in descending order
var_importance_df <- var_importance_df[order(var_importance_df$Importance, decreasing = TRUE), ]

# plot variable importance
ggplot(var_importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Variable Importance from Random Forest",
    x = "Variables",
    y = "Importance"
  )

We see similar variables are most important using descriptive ML, reinforcing our previous decision.

Explanatory Modeling

#Load data
data <- read_dta("data/raw/ZA7575.dta")
df_reduced <- readRDS("df_reduced.rds")

# restore the old NAs and do CCA
df_reduced$qc19 <- data$qc19
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 3, NA, df_reduced$qc19)

df_reduced <- df_reduced %>%
  drop_na(qc19)
  
# quick data check
table(df_reduced$country_name)
## 
##        Austria        Belgium       Bulgaria        Croatia         Cyprus 
##            957           1014            738            912            399 
## Czech Republic        Denmark        Estonia        Finland         France 
##            919            930            803            883            915 
##        Germany         Greece        Hungary        Ireland          Italy 
##           1333            923            903            882            889 
##         Latvia      Lithuania     Luxembourg          Malta    Netherlands 
##            847            851            468            448            980 
##         Poland       Portugal        Romania       Slovakia       Slovenia 
##            818            911            905            886            920 
##          Spain         Sweden United Kingdom 
##            921            902            901

Visualize qc19 by country

# Calculate the percentage of "Yes" (1) responses for qc19 by country
country_means <- aggregate(qc19 == 1 ~ country_name, data = df_reduced, FUN = mean)

# Multiply by 100 to get percentage and then sort
country_means$percentage <- country_means$`qc19 == 1` * 100
country_means <- country_means[order(country_means$percentage), ]

# Create the plot
ggplot(country_means, aes(x = reorder(country_name, percentage), y = percentage)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(title = "Support for Transgender Rights by Country",
       subtitle = "Percentage answering 'Yes' to allowing transgender people to change civil documents",
       x = "Country",
       y = "Support (%)") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Prepare for GLM

# scale some of the variables (both categorical and continuous) with larger scales
df_reduced <- df_reduced %>%
  mutate(
    age_z = scale(age),
    religion_z = scale(as.numeric(religion)),
    political_ideology_z = scale(political_ideology),
    respondent_cooperation_z = scale(as.numeric(respondent_cooperation)),
    rainbow_score_z = scale(rainbow_score_2019),
    v2x_egaldem_z = scale(v2x_egaldem))

# Recode qc19 from 1 and 2 to 0 and 1 for the binary model
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 1, 1, 0)

# Optimize the settings to avoid convergence problems in the glmer() functions
control = glmerControl(optimizer = "bobyqa", 
                       optCtrl = list(maxfun = 100000))

# Recode some of the variables such that higher values translate to a more positive
# attitude towards LGBT rights in general -> simplieies coefficient interpretation
df_reduced <- df_reduced %>% 
  mutate(trans_friends = ifelse(trans_friends == 1, 2, 1),  # trans_friends is sd1_7 (see Reduced dataset.R)
         lgb_friends = ifelse(lgb_friends == 1, 2, 1)) # lgb_friends is sd1_4 (see Reduced dataset.R)

1. Null model (intercept only)

# STEP 1: Null Model (Intercept-only model)
null_model <- glmer(qc19 ~ 1 + (1|country_name), family = "binomial", data = df_reduced)
summary(null_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: qc19 ~ 1 + (1 | country_name)
##    Data: df_reduced
## 
##      AIC      BIC   logLik deviance df.resid 
##  28212.6  28228.8 -14104.3  28208.6    24156 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0233 -0.9008  0.5066  0.6531  2.2755 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  country_name (Intercept) 0.9649   0.9823  
## Number of obs: 24158, groups:  country_name, 28
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   0.4495     0.1856   2.422   0.0154 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate ICC
performance::icc(null_model)
## # Intraclass Correlation Coefficient
## 
##     Adjusted ICC: 0.227
##   Unadjusted ICC: 0.227

The null model indicates a statistically significant baseline probability of support for transgender individuals obtaining official documents (intercept = 0.45, p = 0.015), with substantial country-level variation accounting for 22.7% of the total variance in this support (ICC = 0.227).

2. Individual-level predictors (level 1)

# STEP 2: Add Individual-Level Predictors (Level 1)
# starting with predictors that showed correlation with qc19
model1 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1|country_name), 
               family = "binomial", 
               control = control,
               data = df_reduced)
summary(model1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +  
##     trans_friends + respondent_cooperation_z + (1 | country_name)
##    Data: df_reduced
## Control: control
## 
##      AIC      BIC   logLik deviance df.resid 
##  26411.0  26483.8 -13196.5  26393.0    24149 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.1228 -0.7612  0.3617  0.6576  4.7443 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  country_name (Intercept) 0.6956   0.834   
## Number of obs: 24158, groups:  country_name, 28
## 
## Fixed effects:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.79076    0.17981  -9.959  < 2e-16 ***
## age_z                    -0.08163    0.01613  -5.060 4.20e-07 ***
## gender                    0.38278    0.03068  12.477  < 2e-16 ***
## religion_z                0.10468    0.01767   5.924 3.14e-09 ***
## political_ideology_z     -0.13256    0.01552  -8.543  < 2e-16 ***
## lgb_friends               0.88187    0.03648  24.173  < 2e-16 ***
## trans_friends             0.40647    0.06001   6.773 1.26e-11 ***
## respondent_cooperation_z -0.27127    0.01610 -16.845  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) age_z  gender rlgn_z pltc__ lgb_fr trns_f
## age_z       -0.070                                          
## gender      -0.246 -0.022                                   
## religion_z   0.014  0.119  0.098                            
## pltcl_dlgy_ -0.021  0.045  0.007  0.070                     
## lgb_friends -0.187  0.211 -0.043 -0.087  0.032              
## trans_frnds -0.297  0.036 -0.005 -0.025  0.015 -0.224       
## rspndnt_cp_ -0.021 -0.078 -0.023 -0.004 -0.005  0.068  0.017
# compare with the null_model
anova(null_model, model1)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
##            npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)    
## null_model    2 28213 28229 -14104    28209                         
## model1        9 26411 26484 -13196    26393 1815.7  7  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
  list("Model 1" = null_model, 
       "Model 2" = model1),
  stars = TRUE,
  gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
Model 1 Model 2
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 0.449* -1.791***
(0.186) (0.180)
age_z -0.082***
(0.016)
gender 0.383***
(0.031)
religion_z 0.105***
(0.018)
political_ideology_z -0.133***
(0.016)
lgb_friends 0.882***
(0.036)
trans_friends 0.406***
(0.060)
respondent_cooperation_z -0.271***
(0.016)
SD (Intercept country_name) 0.982 0.834
Num.Obs. 24158 24158
AIC 28212.6 26411.0
BIC 28228.8 26483.8

Model 1 significantly improves fit over the null model (χ² = 1720.5, p < 0.001, lower AIC/BIC). Transgender documentation is positively associated with characteristics such as being female and having LGB/transgender friends, while negatively associated with age and respondent cooperation during the survey.

3. Country-level predictors (level 2)

# STEP 3: Add Country-Level Predictors (Level 2)

# We test three different models based on 'themes'. As we only have 28 level-2 units, 
# we should not add more than 2-3 country-level variables to each model --> overfitting

# Political/ Economic development model
model_econ_pol <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + 
                   trans_friends + respondent_cooperation_z +
                   scale(gdp_2018) + z_functioning_of_government + 
                   (1 + lgb_friends|country_name), 
                   family = "binomial", data = df_reduced)

# Social wellbeing model
model_wellbeing <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + 
                        trans_friends + respondent_cooperation_z +
                        scale(Happiness_Score) + scale(gender_equality_index) + 
                        (1 + lgb_friends|country_name), 
                        family = "binomial", data = df_reduced)

# LGBTQ/equality values model 
model_equality <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends +       respondent_cooperation_z +
                 rainbow_score_z + v2x_egaldem_z + (1|country_name), 
               family = "binomial", 
               control = control,
               data = df_reduced)

# decide which model is the best
anova(model_econ_pol, model_wellbeing, model_equality) # equality is the best
## Data: df_reduced
## Models:
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model_econ_pol: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + scale(gdp_2018) + z_functioning_of_government + (1 + lgb_friends | country_name)
## model_wellbeing: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + scale(Happiness_Score) + scale(gender_equality_index) + (1 + lgb_friends | country_name)
##                 npar   AIC   BIC logLik deviance Chisq Df Pr(>Chisq)
## model_equality    11 26382 26471 -13180    26360                    
## model_econ_pol    13 26401 26506 -13188    26375     0  2          1
## model_wellbeing   13 26404 26509 -13189    26378     0  0
# check whether the equality model can be improved
model_equality_2 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends +       respondent_cooperation_z +
                 rainbow_score_z + v2x_egaldem_z + z_functioning_of_government + 
                   (1|country_name), 
               family = "binomial", 
               control = control,
               data = df_reduced)

anova(model_equality, model_equality_2) # no improvement --> keep model_equality
## Data: df_reduced
## Models:
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model_equality_2: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + z_functioning_of_government + (1 | country_name)
##                  npar   AIC   BIC logLik deviance  Chisq Df Pr(>Chisq)
## model_equality     11 26382 26471 -13180    26360                     
## model_equality_2   12 26383 26480 -13180    26359 1.0893  1     0.2966
# test for multicollinearity among the two cultural country-level variables
vif_model <- lm(qc19 ~ rainbow_score_z + v2x_egaldem_z, data = df_reduced)
vif(vif_model)  # no multicollinearity issues --> keep model_equality as it is
## rainbow_score_z   v2x_egaldem_z 
##        1.179562        1.179562
# compare with the previous two models
anova(null_model, model1, model_equality)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
##                npar   AIC   BIC logLik deviance    Chisq Df Pr(>Chisq)    
## null_model        2 28213 28229 -14104    28209                           
## model1            9 26411 26484 -13196    26393 1815.669  7  < 2.2e-16 ***
## model_equality   11 26382 26471 -13180    26360   32.612  2  8.285e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
  list("Model 1" = null_model, 
       "Model 2" = model1,
       "Model 3" = model_equality),
  stars = TRUE,
  gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
Model 1 Model 2 Model 3
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 0.449* -1.791*** -1.763***
(0.186) (0.180) (0.122)
age_z -0.082*** -0.083***
(0.016) (0.016)
gender 0.383*** 0.384***
(0.031) (0.031)
religion_z 0.105*** 0.102***
(0.018) (0.018)
political_ideology_z -0.133*** -0.132***
(0.016) (0.016)
lgb_friends 0.882*** 0.876***
(0.036) (0.036)
trans_friends 0.406*** 0.407***
(0.060) (0.060)
respondent_cooperation_z -0.271*** -0.269***
(0.016) (0.016)
rainbow_score_z 0.354***
(0.091)
v2x_egaldem_z 0.480***
(0.095)
SD (Intercept country_name) 0.982 0.834 0.458
Num.Obs. 24158 24158 24158
AIC 28212.6 26411.0 26382.4
BIC 28228.8 26483.8 26471.4

Model 2 improves fit yet again by adding country-level predictors (χ² = 33.7, p < 0.001, AIC reduced by 29.7 points). The strongest predictors are LGB friends (0.88), egalitarian democracy (0.49), transgender friends (0.42), and LGBTQ legal protections (rainbow score, 0.37), while individual-level predictors remained consistent with Model 1.

4. Random slopes for individual-level predictors

# STEP 4: Add Random Slopes for Individual-Level Predictors
# Let's add random slopes for lgb_friends which has shown a strong effect
model3 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + 
                rainbow_score_z + v2x_egaldem_z + 
                (1 + lgb_friends|country_name), 
               family = "binomial", 
               data = df_reduced)
summary(model3)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +  
##     trans_friends + respondent_cooperation_z + rainbow_score_z +  
##     v2x_egaldem_z + (1 + lgb_friends | country_name)
##    Data: df_reduced
## 
##      AIC      BIC   logLik deviance df.resid 
##  26379.2  26484.4 -13176.6  26353.2    24145 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.8970 -0.7605  0.3654  0.6560  4.7324 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev. Corr 
##  country_name (Intercept) 0.28167  0.5307        
##               lgb_friends 0.02986  0.1728   -0.53
## Number of obs: 24158, groups:  country_name, 28
## 
## Fixed effects:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.75034    0.13324 -13.136  < 2e-16 ***
## age_z                    -0.08369    0.01617  -5.177 2.25e-07 ***
## gender                    0.38596    0.03072  12.565  < 2e-16 ***
## religion_z                0.10190    0.01769   5.761 8.36e-09 ***
## political_ideology_z     -0.13125    0.01556  -8.437  < 2e-16 ***
## lgb_friends               0.87288    0.05021  17.385  < 2e-16 ***
## trans_friends             0.40446    0.06011   6.728 1.72e-11 ***
## respondent_cooperation_z -0.26967    0.01612 -16.734  < 2e-16 ***
## rainbow_score_z           0.36167    0.09015   4.012 6.02e-05 ***
## v2x_egaldem_z             0.47996    0.09428   5.091 3.57e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) age_z  gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z       -0.098                                                        
## gender      -0.332 -0.022                                                 
## religion_z   0.018  0.120  0.098                                          
## pltcl_dlgy_ -0.031  0.046  0.007  0.070                                   
## lgb_friends -0.466  0.158 -0.029 -0.059  0.027                            
## trans_frnds -0.400  0.036 -0.005 -0.025  0.015 -0.166                     
## rspndnt_cp_ -0.028 -0.078 -0.024 -0.003 -0.004  0.048  0.018              
## ranbw_scr_z  0.030 -0.032  0.013 -0.003  0.014 -0.029 -0.008  0.001       
## v2x_egldm_z  0.006 -0.012  0.012 -0.020  0.011 -0.008  0.012  0.011 -0.346
# compare with previous three models
anova(null_model, model1, model_equality, model3)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
##                npar   AIC   BIC logLik deviance     Chisq Df Pr(>Chisq)    
## null_model        2 28213 28229 -14104    28209                            
## model1            9 26411 26484 -13196    26393 1815.6686  7  < 2.2e-16 ***
## model_equality   11 26382 26471 -13180    26360   32.6125  2  8.285e-08 ***
## model3           13 26379 26484 -13177    26353    7.1246  2    0.02837 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
  list("Model 1" = null_model, 
       "Model 2" = model1,
       "Model 3" = model_equality,
       "Model 4" = model3),
  stars = TRUE,
  gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
Model 1 Model 2 Model 3 Model 4
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 0.449* -1.791*** -1.763*** -1.750***
(0.186) (0.180) (0.122) (0.133)
age_z -0.082*** -0.083*** -0.084***
(0.016) (0.016) (0.016)
gender 0.383*** 0.384*** 0.386***
(0.031) (0.031) (0.031)
religion_z 0.105*** 0.102*** 0.102***
(0.018) (0.018) (0.018)
political_ideology_z -0.133*** -0.132*** -0.131***
(0.016) (0.016) (0.016)
lgb_friends 0.882*** 0.876*** 0.873***
(0.036) (0.036) (0.050)
trans_friends 0.406*** 0.407*** 0.404***
(0.060) (0.060) (0.060)
respondent_cooperation_z -0.271*** -0.269*** -0.270***
(0.016) (0.016) (0.016)
rainbow_score_z 0.354*** 0.362***
(0.091) (0.090)
v2x_egaldem_z 0.480*** 0.480***
(0.095) (0.094)
SD (Intercept country_name) 0.982 0.834 0.458 0.531
SD (lgb_friends country_name) 0.173
Cor (Intercept~lgb_friends country_name) -0.532
Num.Obs. 24158 24158 24158 24158
AIC 28212.6 26411.0 26382.4 26379.2
BIC 28228.8 26483.8 26471.4 26484.4

Model 3 slightly improves fit by adding random slopes for LGB friends (χ² = 7.92, p = 0.019). There is modest cross-country variation in the effect of having LGB friends (SD = 0.176), with a negative correlation (-0.48) between intercepts and slopes indicating stronger effects in countries with lower baseline support.

5. Cross-level interactions

# STEP 5: Add Cross-Level Interactions
# Assuming the random slope for lgb_friends is significant
# We'll test cross-level interactions with both country-level predictors
model4a <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + 
                rainbow_score_z + v2x_egaldem_z + 
                lgb_friends:rainbow_score_z +
                (1 + lgb_friends|country_name), 
               family = "binomial", data = df_reduced)
summary(model4a)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +  
##     trans_friends + respondent_cooperation_z + rainbow_score_z +  
##     v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends |  
##     country_name)
##    Data: df_reduced
## 
##      AIC      BIC   logLik deviance df.resid 
##  26381.2  26494.5 -13176.6  26353.2    24144 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.8950 -0.7604  0.3654  0.6560  4.7324 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev. Corr 
##  country_name (Intercept) 0.28176  0.5308        
##               lgb_friends 0.02985  0.1728   -0.53
## Number of obs: 24158, groups:  country_name, 28
## 
## Fixed effects:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -1.7502822  0.1332809 -13.132  < 2e-16 ***
## age_z                       -0.0836921  0.0161660  -5.177 2.25e-07 ***
## gender                       0.3859664  0.0307213  12.563  < 2e-16 ***
## religion_z                   0.1019082  0.0176897   5.761 8.37e-09 ***
## political_ideology_z        -0.1312573  0.0155631  -8.434  < 2e-16 ***
## lgb_friends                  0.8728531  0.0502206  17.380  < 2e-16 ***
## trans_friends                0.4044711  0.0601146   6.728 1.72e-11 ***
## respondent_cooperation_z    -0.2696768  0.0161184 -16.731  < 2e-16 ***
## rainbow_score_z              0.3625479  0.1144465   3.168  0.00154 ** 
## v2x_egaldem_z                0.4799307  0.0943035   5.089 3.60e-07 ***
## lgb_friends:rainbow_score_z -0.0005866  0.0481628  -0.012  0.99028    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) age_z  gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z       -0.098                                                        
## gender      -0.331 -0.022                                                 
## religion_z   0.018  0.120  0.098                                          
## pltcl_dlgy_ -0.031  0.046  0.007  0.070                                   
## lgb_friends -0.466  0.159 -0.030 -0.059  0.028                            
## trans_frnds -0.400  0.036 -0.005 -0.025  0.015 -0.167                     
## rspndnt_cp_ -0.029 -0.078 -0.024 -0.003 -0.003  0.049  0.017              
## ranbw_scr_z  0.038 -0.026  0.020  0.006 -0.007 -0.038 -0.002 -0.012       
## v2x_egldm_z  0.006 -0.012  0.012 -0.020  0.012 -0.007  0.012  0.011 -0.286
## lgb_frnd:__ -0.024  0.002 -0.016 -0.013  0.029  0.025 -0.008  0.020 -0.616
##             v2x_g_
## age_z             
## gender            
## religion_z        
## pltcl_dlgy_       
## lgb_friends       
## trans_frnds       
## rspndnt_cp_       
## ranbw_scr_z       
## v2x_egldm_z       
## lgb_frnd:__  0.022
# Alternative interaction with v2x_egaldem
model4b <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + 
                rainbow_score_z+ v2x_egaldem_z + 
                lgb_friends:v2x_egaldem_z +
                (1 + lgb_friends|country_name), 
                family = "binomial",
               data = df_reduced)
summary(model4b)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +  
##     trans_friends + respondent_cooperation_z + rainbow_score_z +  
##     v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends |  
##     country_name)
##    Data: df_reduced
## 
##      AIC      BIC   logLik deviance df.resid 
##  26380.7  26494.0 -13176.3  26352.7    24144 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.9289 -0.7612  0.3665  0.6548  4.7850 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev. Corr 
##  country_name (Intercept) 0.28103  0.5301        
##               lgb_friends 0.02859  0.1691   -0.53
## Number of obs: 24158, groups:  country_name, 28
## 
## Fixed effects:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.74930    0.13319 -13.134  < 2e-16 ***
## age_z                     -0.08385    0.01617  -5.187 2.14e-07 ***
## gender                     0.38602    0.03072  12.566  < 2e-16 ***
## religion_z                 0.10193    0.01769   5.763 8.24e-09 ***
## political_ideology_z      -0.13156    0.01556  -8.453  < 2e-16 ***
## lgb_friends                0.87464    0.04979  17.568  < 2e-16 ***
## trans_friends              0.40301    0.06009   6.706 1.99e-11 ***
## respondent_cooperation_z  -0.26977    0.01612 -16.738  < 2e-16 ***
## rainbow_score_z            0.36184    0.09011   4.016 5.93e-05 ***
## v2x_egaldem_z              0.53462    0.11836   4.517 6.27e-06 ***
## lgb_friends:v2x_egaldem_z -0.03826    0.05046  -0.758    0.448    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) age_z  gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z       -0.098                                                        
## gender      -0.332 -0.022                                                 
## religion_z   0.018  0.120  0.098                                          
## pltcl_dlgy_ -0.031  0.047  0.007  0.070                                   
## lgb_friends -0.464  0.159 -0.030 -0.059  0.026                            
## trans_frnds -0.400  0.037 -0.005 -0.025  0.016 -0.169                     
## rspndnt_cp_ -0.028 -0.077 -0.024 -0.003 -0.004  0.048  0.018              
## ranbw_scr_z  0.030 -0.032  0.012 -0.003  0.014 -0.029 -0.008  0.001       
## v2x_egldm_z  0.019 -0.022  0.012 -0.016 -0.010  0.008 -0.010  0.004 -0.269
## lgb_frn:2__ -0.012  0.013 -0.004 -0.002  0.026 -0.046  0.033  0.009 -0.005
##             v2x_g_
## age_z             
## gender            
## religion_z        
## pltcl_dlgy_       
## lgb_friends       
## trans_frnds       
## rspndnt_cp_       
## ranbw_scr_z       
## v2x_egldm_z       
## lgb_frn:2__ -0.616
# compare with previous models
anova(null_model, model1, model_equality, model3, model4a, model4b)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
## model4a: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends | country_name)
## model4b: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends | country_name)
##                npar   AIC   BIC logLik deviance     Chisq Df Pr(>Chisq)    
## null_model        2 28213 28229 -14104    28209                            
## model1            9 26411 26484 -13196    26393 1815.6686  7  < 2.2e-16 ***
## model_equality   11 26382 26471 -13180    26360   32.6125  2  8.285e-08 ***
## model3           13 26379 26484 -13177    26353    7.1246  2    0.02837 *  
## model4a          14 26381 26494 -13177    26353    0.0001  1    0.99024    
## model4b          14 26381 26494 -13176    26353    0.5685  0               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
  list("Model 1" = null_model, 
       "Model 2" = model1,
       "Model 3" = model_equality,
       "Model 4" = model3,
       "Model 5a" = model4a,
       "Model 5b" = model4b),
  stars = TRUE,
  gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
Model 1 Model 2 Model 3 Model 4 Model 5a Model 5b
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 0.449* -1.791*** -1.763*** -1.750*** -1.750*** -1.749***
(0.186) (0.180) (0.122) (0.133) (0.133) (0.133)
age_z -0.082*** -0.083*** -0.084*** -0.084*** -0.084***
(0.016) (0.016) (0.016) (0.016) (0.016)
gender 0.383*** 0.384*** 0.386*** 0.386*** 0.386***
(0.031) (0.031) (0.031) (0.031) (0.031)
religion_z 0.105*** 0.102*** 0.102*** 0.102*** 0.102***
(0.018) (0.018) (0.018) (0.018) (0.018)
political_ideology_z -0.133*** -0.132*** -0.131*** -0.131*** -0.132***
(0.016) (0.016) (0.016) (0.016) (0.016)
lgb_friends 0.882*** 0.876*** 0.873*** 0.873*** 0.875***
(0.036) (0.036) (0.050) (0.050) (0.050)
trans_friends 0.406*** 0.407*** 0.404*** 0.404*** 0.403***
(0.060) (0.060) (0.060) (0.060) (0.060)
respondent_cooperation_z -0.271*** -0.269*** -0.270*** -0.270*** -0.270***
(0.016) (0.016) (0.016) (0.016) (0.016)
rainbow_score_z 0.354*** 0.362*** 0.363** 0.362***
(0.091) (0.090) (0.114) (0.090)
v2x_egaldem_z 0.480*** 0.480*** 0.480*** 0.535***
(0.095) (0.094) (0.094) (0.118)
lgb_friends × rainbow_score_z -0.001
(0.048)
lgb_friends × v2x_egaldem_z -0.038
(0.050)
SD (Intercept country_name) 0.982 0.834 0.458 0.531 0.531 0.530
SD (lgb_friends country_name) 0.173 0.173 0.169
Cor (Intercept~lgb_friends country_name) -0.532 -0.532 -0.532
Num.Obs. 24158 24158 24158 24158 24158 24158
AIC 28212.6 26411.0 26382.4 26379.2 26381.2 26380.7
BIC 28228.8 26483.8 26471.4 26484.4 26494.5 26494.0

Testing cross-level interactions shows that neither the interaction between LGB friends and Rainbow score (p = 0.90) nor between LGB friends and egalitarian democracy (p = 0.41) is significant. Adding these interaction terms actually worsens model fit (AIC increases).

6. Compare models

# compare all models
anova(null_model, model1, model_equality, model3, model4a, model4b)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
## model4a: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends | country_name)
## model4b: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends | country_name)
##                npar   AIC   BIC logLik deviance     Chisq Df Pr(>Chisq)    
## null_model        2 28213 28229 -14104    28209                            
## model1            9 26411 26484 -13196    26393 1815.6686  7  < 2.2e-16 ***
## model_equality   11 26382 26471 -13180    26360   32.6125  2  8.285e-08 ***
## model3           13 26379 26484 -13177    26353    7.1246  2    0.02837 *  
## model4a          14 26381 26494 -13177    26353    0.0001  1    0.99024    
## model4b          14 26381 26494 -13176    26353    0.5685  0               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# use Stargazer to display the output in a nice table
model_stats_extended <- function(models, model_names) {
  result <- data.frame(
    Model = character(),
    AIC = numeric(),
    BIC = numeric(),
    LogLik = numeric(),
    Deviance = numeric(),
    ChiSq = numeric(),
    Df = numeric(),
    p = numeric(),
    ICC = numeric(),
    stringsAsFactors = FALSE
  )
  
  # Extract basic statistics
  for (i in 1:length(models)) {
    model <- models[[i]]
    result[i, "Model"] <- model_names[i]
    result[i, "AIC"] <- AIC(model)
    result[i, "BIC"] <- BIC(model)
    result[i, "LogLik"] <- as.numeric(logLik(model))
    result[i, "Deviance"] <- deviance(model)
    result[i, "ICC"] <- performance::icc(model)$ICC_conditional
  }
  
  # Extract chi-square statistics from ANOVA comparison
  anova_result <- anova(models[[1]], models[[2]], models[[3]], models[[4]], models[[5]], models[[6]])
  
  for (i in 2:length(models)) {
    result[i, "ChiSq"] <- anova_result$Chisq[i]
    result[i, "Df"] <- anova_result$Df[i]
    result[i, "p"] <- anova_result$`Pr(>Chisq)`[i]
  }
  
  return(result)
}

# for the labels
model_names <- c("Null model", "Individual predictors", "Country-level predictors", 
                 "Random slopes", "Interaction (Rainbow)", "Interaction (Egalitarian)")

# extract statistics
all_models_stats <- model_stats_extended(
  list(null_model, model1, model_equality, model3, model4a, model4b),
  model_names)

# split into two separate tables because one would be too wide
main_models_table <- stargazer(null_model, model1, model_equality, model3, 
          type = "text",
          title = "Main Multilevel Models for Transgender Document Change Support",
          column.labels = model_names[1:4],
          digits = 3,
          star.cutoffs = c(0.05, 0.01, 0.001),
          intercept.bottom = FALSE,
          single.row = FALSE,
          model.numbers = FALSE,
          notes.append = FALSE,
          notes = "* p<0.05; ** p<0.01; *** p<0.001",
          add.lines = list(
            c("AIC", round(all_models_stats$AIC[1:4], 1)),
            c("BIC", round(all_models_stats$BIC[1:4], 1)),
            c("Log Likelihood", round(all_models_stats$LogLik[1:4], 1)),
            c("Chi-square", c("", round(all_models_stats$ChiSq[2:4], 2))),
            c("df", c("", all_models_stats$Df[2:4])),
            c("p-value", c("", format.pval(all_models_stats$p[2:4], digits = 3))),
            c("ICC", round(all_models_stats$ICC[1:4], 3))))
## 
## Main Multilevel Models for Transgender Document Change Support
## =================================================================================================
##                                                    Dependent variable:                           
##                          ------------------------------------------------------------------------
##                                                            qc19                                  
##                          Null model  Individual predictors Country-level predictors Random slopes
## -------------------------------------------------------------------------------------------------
## Constant                   0.449*          -1.791***              -1.763***           -1.750***  
##                            (0.186)          (0.180)                (0.122)             (0.133)   
##                                                                                                  
## age_z                                      -0.082***              -0.083***           -0.084***  
##                                             (0.016)                (0.016)             (0.016)   
##                                                                                                  
## gender                                     0.383***                0.384***           0.386***   
##                                             (0.031)                (0.031)             (0.031)   
##                                                                                                  
## religion_z                                 0.105***                0.102***           0.102***   
##                                             (0.018)                (0.018)             (0.018)   
##                                                                                                  
## political_ideology_z                       -0.133***              -0.132***           -0.131***  
##                                             (0.016)                (0.016)             (0.016)   
##                                                                                                  
## lgb_friends                                0.882***                0.876***           0.873***   
##                                             (0.036)                (0.036)             (0.050)   
##                                                                                                  
## trans_friends                              0.406***                0.407***           0.404***   
##                                             (0.060)                (0.060)             (0.060)   
##                                                                                                  
## respondent_cooperation_z                   -0.271***              -0.269***           -0.270***  
##                                             (0.016)                (0.016)             (0.016)   
##                                                                                                  
## rainbow_score_z                                                    0.354***           0.362***   
##                                                                    (0.091)             (0.090)   
##                                                                                                  
## v2x_egaldem_z                                                      0.480***           0.480***   
##                                                                    (0.095)             (0.094)   
##                                                                                                  
## -------------------------------------------------------------------------------------------------
## AIC                        28212.6           26411                 26382.4             26379.2   
## BIC                        28228.8          26483.8                26471.4             26484.4   
## Log Likelihood            -14104.3         -13196.5                -13180.2           -13176.6   
## Chi-square                                  1815.67                 32.61               7.12     
## df                                             7                      2                   2      
## p-value                                     < 2e-16                8.28e-08            0.0284    
## ICC                         0.227            0.154                  0.043               0.043    
## Observations               24,158           24,158                  24,158             24,158    
## Log Likelihood           -14,104.320      -13,196.490            -13,180.180         -13,176.620 
## Akaike Inf. Crit.        28,212.650       26,410.980              26,382.360         26,379.240  
## Bayesian Inf. Crit.      28,228.830       26,483.810              26,471.380         26,484.440  
## =================================================================================================
## Note:                                                            * p<0.05; ** p<0.01; *** p<0.001
interaction_models_table <- stargazer(model3, model4a, model4b, 
          type = "text",
          title = "Interaction Models for Transgender Document Change Support",
          column.labels = model_names[4:6],
          digits = 3,
          star.cutoffs = c(0.05, 0.01, 0.001),
          intercept.bottom = FALSE,
          single.row = FALSE,
          model.numbers = FALSE,
          notes.append = FALSE,
          notes = "* p<0.05; ** p<0.01; *** p<0.001",
          add.lines = list(
            c("AIC", round(all_models_stats$AIC[4:6], 1)),
            c("BIC", round(all_models_stats$BIC[4:6], 1)),
            c("Log Likelihood", round(all_models_stats$LogLik[4:6], 1)),
            c("Chi-square", c("", round(all_models_stats$ChiSq[5:6], 2))),
            c("df", c("", all_models_stats$Df[5:6])),
            c("p-value", c("", format.pval(all_models_stats$p[5:6], digits = 3))),
            c("ICC", round(all_models_stats$ICC[4:6], 3))))
## 
## Interaction Models for Transgender Document Change Support
## =========================================================================================
##                                                  Dependent variable:                     
##                             -------------------------------------------------------------
##                                                         qc19                             
##                             Random slopes Interaction (Rainbow) Interaction (Egalitarian)
## -----------------------------------------------------------------------------------------
## Constant                      -1.750***         -1.750***               -1.749***        
##                                (0.133)           (0.133)                 (0.133)         
##                                                                                          
## age_z                         -0.084***         -0.084***               -0.084***        
##                                (0.016)           (0.016)                 (0.016)         
##                                                                                          
## gender                        0.386***          0.386***                0.386***         
##                                (0.031)           (0.031)                 (0.031)         
##                                                                                          
## religion_z                    0.102***          0.102***                0.102***         
##                                (0.018)           (0.018)                 (0.018)         
##                                                                                          
## political_ideology_z          -0.131***         -0.131***               -0.132***        
##                                (0.016)           (0.016)                 (0.016)         
##                                                                                          
## lgb_friends                   0.873***          0.873***                0.875***         
##                                (0.050)           (0.050)                 (0.050)         
##                                                                                          
## trans_friends                 0.404***          0.404***                0.403***         
##                                (0.060)           (0.060)                 (0.060)         
##                                                                                          
## respondent_cooperation_z      -0.270***         -0.270***               -0.270***        
##                                (0.016)           (0.016)                 (0.016)         
##                                                                                          
## rainbow_score_z               0.362***           0.363**                0.362***         
##                                (0.090)           (0.114)                 (0.090)         
##                                                                                          
## v2x_egaldem_z                 0.480***          0.480***                0.535***         
##                                (0.094)           (0.094)                 (0.118)         
##                                                                                          
## lgb_friends:rainbow_score_z                      -0.001                                  
##                                                  (0.048)                                 
##                                                                                          
## lgb_friends:v2x_egaldem_z                                                -0.038          
##                                                                          (0.050)         
##                                                                                          
## -----------------------------------------------------------------------------------------
## AIC                            26379.2           26381.2                 26380.7         
## BIC                            26484.4           26494.5                  26494          
## Log Likelihood                -13176.6          -13176.6                -13176.3         
## Chi-square                                          0                     0.57           
## df                                                  1                       0            
## p-value                                           0.99                     NA            
## ICC                             0.043             0.043                   0.043          
## Observations                   24,158            24,158                  24,158          
## Log Likelihood               -13,176.620       -13,176.620             -13,176.330       
## Akaike Inf. Crit.            26,379.240        26,381.240              26,380.670        
## Bayesian Inf. Crit.          26,484.440        26,494.530              26,493.960        
## =========================================================================================
## Note:                                                    * p<0.05; ** p<0.01; *** p<0.001
# Get anova results for interaction models
anova_results_interactions <- anova(model3, model4a, model4b)

stargazer(model3, model4a, model4b, 
          type = "text",
          title = "Interaction Models for Transgender Document Change Support",
          column.labels = c("Random Slopes", "Rainbow Interaction", "Egalitarian Interaction"),
          covariate.labels = c("Age (z-score)", "Gender", "Religion (z-score)", 
                              "Political ideology (z-score)", "Has LGB friends", 
                              "Has transgender friends", "Respondent cooperation (z-score)",
                              "Rainbow score (z-score)", "Egalitarian democracy (z-score)",
                              "LGB friends × Rainbow score", 
                              "LGB friends × Egalitarian democracy"),
          digits = 3,
          star.cutoffs = c(0.05, 0.01, 0.001),
          add.lines = list(
            c("Chi-square", "", 
              round(anova_results_interactions$Chisq[2], 2), 
              round(anova_results_interactions$Chisq[3], 2)),
            c("df", "", 
              anova_results_interactions$Df[2], 
              anova_results_interactions$Df[3]),
            c("p-value", "", 
              format.pval(anova_results_interactions$`Pr(>Chisq)`[2], digits = 3),
              format.pval(anova_results_interactions$`Pr(>Chisq)`[3], digits = 3)),
            c("ICC", 
              round(performance::icc(model3)$ICC_conditional, 3),
              round(performance::icc(model4a)$ICC_conditional, 3),
              round(performance::icc(model4b)$ICC_conditional, 3))),
          notes = "* p<0.05; ** p<0.01; *** p<0.001",
          out = "interaction_models.txt")
## 
## Interaction Models for Transgender Document Change Support
## =============================================================================================
##                                                        Dependent variable:                   
##                                     ---------------------------------------------------------
##                                                               qc19                           
##                                     Random Slopes Rainbow Interaction Egalitarian Interaction
##                                          (1)              (2)                   (3)          
## ---------------------------------------------------------------------------------------------
## Age (z-score)                         -0.084***        -0.084***             -0.084***       
##                                        (0.016)          (0.016)               (0.016)        
##                                                                                              
## Gender                                0.386***         0.386***              0.386***        
##                                        (0.031)          (0.031)               (0.031)        
##                                                                                              
## Religion (z-score)                    0.102***         0.102***              0.102***        
##                                        (0.018)          (0.018)               (0.018)        
##                                                                                              
## Political ideology (z-score)          -0.131***        -0.131***             -0.132***       
##                                        (0.016)          (0.016)               (0.016)        
##                                                                                              
## Has LGB friends                       0.873***         0.873***              0.875***        
##                                        (0.050)          (0.050)               (0.050)        
##                                                                                              
## Has transgender friends               0.404***         0.404***              0.403***        
##                                        (0.060)          (0.060)               (0.060)        
##                                                                                              
## Respondent cooperation (z-score)      -0.270***        -0.270***             -0.270***       
##                                        (0.016)          (0.016)               (0.016)        
##                                                                                              
## Rainbow score (z-score)               0.362***          0.363**              0.362***        
##                                        (0.090)          (0.114)               (0.090)        
##                                                                                              
## Egalitarian democracy (z-score)       0.480***         0.480***              0.535***        
##                                        (0.094)          (0.094)               (0.118)        
##                                                                                              
## LGB friends × Rainbow score                             -0.001                               
##                                                         (0.048)                              
##                                                                                              
## LGB friends × Egalitarian democracy                                           -0.038         
##                                                                               (0.050)        
##                                                                                              
## Constant                              -1.750***        -1.750***             -1.749***       
##                                        (0.133)          (0.133)               (0.133)        
##                                                                                              
## ---------------------------------------------------------------------------------------------
## Chi-square                                                 0                   0.57          
## df                                                         1                     0           
## p-value                                                  0.99                   NA           
## ICC                                     0.043            0.043                 0.043         
## Observations                           24,158           24,158                24,158         
## Log Likelihood                       -13,176.620      -13,176.620           -13,176.330      
## Akaike Inf. Crit.                    26,379.240       26,381.240            26,380.670       
## Bayesian Inf. Crit.                  26,484.440       26,494.530            26,493.960       
## =============================================================================================
## Note:                                                           *p<0.05; **p<0.01; ***p<0.001
##                                                              * p<0.05; ** p<0.01; *** p<0.001
# Get anova results for all models
anova_results_all <- anova(null_model, model1, model_equality, model3, model4a, model4b)

stargazer(null_model, model1, model_equality, model3, model4a, model4b, 
          type = "text",
          title = "Multilevel Models for Transgender Document Change Support",
          column.labels = c("Null", "Individual", "Country", "Random Slopes", 
                           "Rainbow Int.", "Egalitarian Int."),
          covariate.labels = c("Age (z-score)", "Gender", "Religion (z-score)", 
                              "Political ideology (z-score)", "Has LGB friends", 
                              "Has transgender friends", "Respondent cooperation (z-score)",
                              "Rainbow score (z-score)", "Egalitarian democracy (z-score)",
                              "LGB friends × Rainbow score", 
                              "LGB friends × Egalitarian democracy"),
          digits = 3,
          star.cutoffs = c(0.05, 0.01, 0.001),
          add.lines = list(
            c("Chi-square", "", 
              round(anova_results_all$Chisq[2], 2), 
              round(anova_results_all$Chisq[3], 2),
              round(anova_results_all$Chisq[4], 2),
              round(anova_results_all$Chisq[5], 2),
              round(anova_results_all$Chisq[6], 2)),
            c("df", "", 
              anova_results_all$Df[2], 
              anova_results_all$Df[3],
              anova_results_all$Df[4],
              anova_results_all$Df[5],
              anova_results_all$Df[6]),
            c("p-value", "", 
              format.pval(anova_results_all$`Pr(>Chisq)`[2], digits = 3),
              format.pval(anova_results_all$`Pr(>Chisq)`[3], digits = 3),
              format.pval(anova_results_all$`Pr(>Chisq)`[4], digits = 3),
              format.pval(anova_results_all$`Pr(>Chisq)`[5], digits = 3),
              format.pval(anova_results_all$`Pr(>Chisq)`[6], digits = 3)),
            c("ICC", 
              round(performance::icc(null_model)$ICC_conditional, 3),
              round(performance::icc(model1)$ICC_conditional, 3),
              round(performance::icc(model_equality)$ICC_conditional, 3),
              round(performance::icc(model3)$ICC_conditional, 3),
              round(performance::icc(model4a)$ICC_conditional, 3),
              round(performance::icc(model4b)$ICC_conditional, 3))),
          notes = "* p<0.05; ** p<0.01; *** p<0.001",
          font.size = "small",  # Use smaller font to help fit the table
          out = "all_models.txt")
## 
## Multilevel Models for Transgender Document Change Support
## ===================================================================================================================
##                                                                   Dependent variable:                              
##                                     -------------------------------------------------------------------------------
##                                                                          qc19                                      
##                                        Null     Individual    Country   Random Slopes Rainbow Int. Egalitarian Int.
##                                         (1)         (2)         (3)          (4)          (5)            (6)       
## -------------------------------------------------------------------------------------------------------------------
## Age (z-score)                                    -0.082***   -0.083***    -0.084***    -0.084***      -0.084***    
##                                                   (0.016)     (0.016)      (0.016)      (0.016)        (0.016)     
##                                                                                                                    
## Gender                                           0.383***    0.384***     0.386***      0.386***       0.386***    
##                                                   (0.031)     (0.031)      (0.031)      (0.031)        (0.031)     
##                                                                                                                    
## Religion (z-score)                               0.105***    0.102***     0.102***      0.102***       0.102***    
##                                                   (0.018)     (0.018)      (0.018)      (0.018)        (0.018)     
##                                                                                                                    
## Political ideology (z-score)                     -0.133***   -0.132***    -0.131***    -0.131***      -0.132***    
##                                                   (0.016)     (0.016)      (0.016)      (0.016)        (0.016)     
##                                                                                                                    
## Has LGB friends                                  0.882***    0.876***     0.873***      0.873***       0.875***    
##                                                   (0.036)     (0.036)      (0.050)      (0.050)        (0.050)     
##                                                                                                                    
## Has transgender friends                          0.406***    0.407***     0.404***      0.404***       0.403***    
##                                                   (0.060)     (0.060)      (0.060)      (0.060)        (0.060)     
##                                                                                                                    
## Respondent cooperation (z-score)                 -0.271***   -0.269***    -0.270***    -0.270***      -0.270***    
##                                                   (0.016)     (0.016)      (0.016)      (0.016)        (0.016)     
##                                                                                                                    
## Rainbow score (z-score)                                      0.354***     0.362***      0.363**        0.362***    
##                                                               (0.091)      (0.090)      (0.114)        (0.090)     
##                                                                                                                    
## Egalitarian democracy (z-score)                              0.480***     0.480***      0.480***       0.535***    
##                                                               (0.095)      (0.094)      (0.094)        (0.118)     
##                                                                                                                    
## LGB friends × Rainbow score                                                              -0.001                    
##                                                                                         (0.048)                    
##                                                                                                                    
## LGB friends × Egalitarian democracy                                                                     -0.038     
##                                                                                                        (0.050)     
##                                                                                                                    
## Constant                              0.449*     -1.791***   -1.763***    -1.750***    -1.750***      -1.749***    
##                                       (0.186)     (0.180)     (0.122)      (0.133)      (0.133)        (0.133)     
##                                                                                                                    
## -------------------------------------------------------------------------------------------------------------------
## Chi-square                                        1815.67      32.61        7.12           0             0.57      
## df                                                   7           2            2            1              0        
## p-value                                           <2e-16     8.28e-08      0.0284         0.99            NA       
## ICC                                    0.227       0.154       0.043        0.043        0.043          0.043      
## Observations                          24,158      24,158      24,158       24,158        24,158         24,158     
## Log Likelihood                      -14,104.320 -13,196.490 -13,180.180  -13,176.620  -13,176.620    -13,176.330   
## Akaike Inf. Crit.                   28,212.650  26,410.980  26,382.360   26,379.240    26,381.240     26,380.670   
## Bayesian Inf. Crit.                 28,228.830  26,483.810  26,471.380   26,484.440    26,494.530     26,493.960   
## ===================================================================================================================
## Note:                                                                                 *p<0.05; **p<0.01; ***p<0.001
##                                                                                    * p<0.05; ** p<0.01; *** p<0.001

Model 3 (random slopes for LGB friends) is the best model, as it significantly improves fit over simpler models (χ² = 7.92, p = 0.019) and achieves the lowest AIC (26472.5). Subsequent interaction models fail to improve fit (p = 0.90, p = 0.41) and slightly increase AIC, confirming Model 3 as the most robust choice.

7. Variance explained

# For logistic regression, the level-1 variance is fixed at π²/3
pi_squared_by_3 <- (pi^2)/3  # approximately 3.29

# Get variance components from null model
var_null <- as.data.frame(VarCorr(null_model))
between_var_null <- var_null$vcov[1]  # Between-country variance
within_var_null <- pi_squared_by_3    # Fixed residual variance for logistic models

# Get variance components from final model (model3)
var_model3 <- as.data.frame(VarCorr(model3))
between_var_model3 <- var_model3$vcov[1]  # Between-country variance
within_var_model3 <- pi_squared_by_3      # Still fixed for the final model

# Calculate proportion of between-country variance explained
between_var_explained <- (between_var_null - between_var_model3) / between_var_null

# For binomial models, we can't directly calculate within-country variance explained
# We can use an approximation based on the R² measure for GLMMs
# Calculate total variance in both models
total_var_null <- between_var_null + within_var_null
total_var_model3 <- between_var_model3 + within_var_model3

# Calculate proportion of total variance explained
total_var_explained <- 1 - (total_var_model3 / total_var_null)

# Alternatively, use MuMIn package for R² calculation
library(MuMIn)
r2_model3 <- r.squaredGLMM(model3)
## Warning: the null model is only correct if all the variables it uses are identical 
## to those used in fitting the original model.
# This returns two values: R²m (marginal - fixed effects only) and R²c (conditional - fixed + random effects)

cat("Proportion of between-country variance explained:", round(between_var_explained * 100, 1), "%\n")
## Proportion of between-country variance explained: 70.8 %
cat("Proportion of total variance explained:", round(total_var_explained * 100, 1), "%\n")
## Proportion of total variance explained: 16.1 %
cat("R² marginal (fixed effects only):", round(r2_model3[1], 3), "\n")
## R² marginal (fixed effects only): 0.281
cat("R² conditional (fixed + random):", round(r2_model3[2], 3), "\n")
## R² conditional (fixed + random): 0.241

8. Vizualisations

library(ggplot2)
library(sjPlot)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
# 1. Plot fixed effects with error bars
p1 <- plot_model(model3, sort.est = TRUE, show.values = TRUE, value.offset = 0.3) +
  theme_bw() +
  labs(title = "Factors Influencing Support for Transgender Document Change",
       y = "Log-Odds (95% CI)")

# 2. Create a plot showing country variation
# Extract random effects for each country
re <- ranef(model3)$country_name
re_df <- data.frame(
  country = rownames(re),
  intercept = re$`(Intercept)`,
  lgb_slope = re$lgb_friends
)

# Sort by intercept
re_df <- re_df[order(re_df$intercept), ]
re_df$country <- factor(re_df$country, levels = re_df$country)

# Create the plot
p2 <- ggplot(re_df, aes(x = country, y = intercept)) +
  geom_point() +
  geom_errorbar(aes(ymin = intercept - 1.96*attr(re, "postVar")[1,1,]^0.5, 
                    ymax = intercept + 1.96*attr(re, "postVar")[1,1,]^0.5), 
                width = 0.2) +
  coord_flip() +
  theme_bw() +
  labs(title = "Country Differences in Support for Transgender Rights",
       subtitle = "Random intercepts with 95% confidence intervals",
       x = "",
       y = "Random Intercept")

# 3. Create effect plots for key predictors
# For Rainbow Score
p3 <- plot_model(model3, type = "pred", terms = "rainbow_score_z [-2:2]", 
                title = "Effect of Rainbow Score on Support Level", 
                axis.title = c("Rainbow Score (z-score)", "Probability of Support")) +
  theme_bw()
## You are calculating adjusted predictions on the population-level (i.e.
##   `type = "fixed"`) for a *generalized* linear mixed model.
##   This may produce biased estimates due to Jensen's inequality. Consider
##   setting `bias_correction = TRUE` to correct for this bias.
##   See also the documentation of the `bias_correction` argument.
# For Egalitarian Democracy
p4 <- plot_model(model3, type = "pred", terms = "v2x_egaldem_z [-2:2]", 
                title = "Effect of Egalitarian Democracy on Support Level",
                axis.title = c("Egalitarian Democracy (z-score)", "Probability of Support")) +
  theme_bw()

## Threshold plot
# (1) for religion
# simple approach to calculate thresholds
thresholds <- data.frame(
  country = unique(df_reduced$country_name),
  threshold = NA)

# Extract fixed effects
fixed_effects <- fixef(model3)
random_effects <- ranef(model3)$country_name

# Calculate threshold for each country based on model coefficients
for (i in 1:nrow(thresholds)) {
  country_i <- thresholds$country[i]
  
  # Get country-specific intercept
  intercept <- fixed_effects["(Intercept)"] + random_effects[country_i, "(Intercept)"]
  
  # Get religiosity coefficient
  relig_coef <- fixed_effects["religion_z"]
  
  # Calculate other effects (at reference/mean values)
  other_effects <- fixed_effects["lgb_friends"] * 2  # Has LGB friends
  if ("trans_friends" %in% names(fixed_effects))
    other_effects <- other_effects + fixed_effects["trans_friends"] * 1.5
  
  # Calculate religiosity threshold where log-odds = 0 (probability = 0.5)
  # Solve: intercept + relig_coef * threshold + other_effects = 0
  thresholds$threshold[i] <- -(intercept + other_effects) / relig_coef
}

# lot the thresholds
p5 <- ggplot(thresholds, aes(x = reorder(country, -threshold), y = threshold)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Religiosity Threshold for Supporting Transgender Rights by Country",
       subtitle = "Religiosity z-score at which support probability crosses 50%",
       x = "",
       y = "Religiosity Threshold (z-score)")

# (2) for political ideology
# Calculate political ideology thresholds
pol_thresholds <- data.frame(
  country = unique(df_reduced$country_name),
  threshold = NA
)

# Extract fixed effects
fixed_effects <- fixef(model3)
random_effects <- ranef(model3)$country_name

# Calculate threshold for each country based on model coefficients
for (i in 1:nrow(pol_thresholds)) {
  country_i <- pol_thresholds$country[i]
  
  # Get country-specific intercept
  intercept <- fixed_effects["(Intercept)"] + random_effects[country_i, "(Intercept)"]
  
  # Get political ideology coefficient
  pol_coef <- fixed_effects["political_ideology_z"]
  
  # Calculate other effects (at reference/mean values)
  other_effects <- fixed_effects["lgb_friends"] * 2 + # Has LGB friends
                  fixed_effects["trans_friends"] * 1.5 +
                  fixed_effects["religion_z"] * 0  # Set religion to mean
  
  # Calculate ideology threshold where log-odds = 0 (probability = 0.5)
  pol_thresholds$threshold[i] <- -(intercept + other_effects) / pol_coef
}

# Plot the thresholds
p6 <- ggplot(pol_thresholds, aes(x = reorder(country, -threshold), y = threshold)) +
  geom_bar(stat = "identity", fill = "darkred") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Political Ideology Threshold for Supporting Transgender Rights by Country",
       subtitle = "Political ideology z-score at which support probability crosses 50%",
       x = "",
       y = "Political Ideology Threshold (z-score)")

# Display plots
p1

p2

p3

p4

p5

p6